diff --git a/cdlib/algorithms/edge_clustering.py b/cdlib/algorithms/edge_clustering.py
index 54e55e91..ec591c24 100644
--- a/cdlib/algorithms/edge_clustering.py
+++ b/cdlib/algorithms/edge_clustering.py
@@ -1,10 +1,10 @@
from cdlib import EdgeClustering
from collections import defaultdict
import networkx as nx
-from cdlib.utils import convert_graph_formats
-from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted
+from cdlib.utils import convert_graph_formats, nx_node_integer_mapping, remap_edge_communities
+from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted, HLC_read_edge_list_weighted, HLC_full
-__all__ = ["hierarchical_link_community"]
+__all__ = ["hierarchical_link_community", "hierarchical_link_community_w", "hierarchical_link_community_full"]
def hierarchical_link_community(g_original: object) -> EdgeClustering:
@@ -50,3 +50,92 @@ def hierarchical_link_community(g_original: object) -> EdgeClustering:
coms = [list(c) for c in coms.values()]
return EdgeClustering(coms, g_original, "HLC", method_parameters={})
+
+
+def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
+ """
+ HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+ The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
+    At each level of the link dendrogram, the partition density, based on the link density inside communities, is computed to pick the best level at which to cut.
+
+
+ **Supported Graph Types**
+
+ ========== ======== ========
+ Undirected Directed Weighted
+ ========== ======== ========
+    Yes        No       Yes
+ ========== ======== ========
+
+ :param g_original: a networkx/igraph object
+ :return: EdgeClustering object
+
+
+ :Example:
+
+ >>> from cdlib import algorithms
+ >>> import networkx as nx
+ >>> G = nx.karate_club_graph()
+ >>> com = algorithms.hierarchical_link_community_w(G)
+
+ :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. "Link communities reveal multiscale complexity in networks." Nature 466.7307 (2010): 761.
+ """
+
+ g = convert_graph_formats(g_original, nx.Graph)
+
+ adj, edges, ij2wij = HLC_read_edge_list_weighted(g)
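+    # single-linkage clustering over edge similarities; as described above, HLC cuts
+    # the dendrogram at the partition-density maximum (w=ij2wij supplies the edge weights)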
+ edge2cid, _, _, _ = HLC(adj, edges).single_linkage(w=ij2wij)
+
+ coms = defaultdict(list)
+ for e, com in edge2cid.items():
+ coms[com].append(e)
+
+ coms = [list(c) for c in coms.values()]
+ return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})
+
+
+ """
+ HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+ The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
+    At each level of the link dendrogram, the partition density, based on the link density inside communities, is computed to pick the best level at which to cut.
+    This implementation follows the algorithm described in Ahn et al. and uses numpy/scipy to speed up the clustering computation and reduce memory consumption.
+
+
+ **Supported Graph Types**
+
+ ========== ======== ========
+ Undirected Directed Weighted
+ ========== ======== ========
+    Yes        No       Yes
+ ========== ======== ========
+
+ :param g_original: a networkx/igraph object
+    :param weight: edge attribute name (usually 'weight') to be used as weight; when set, the Tanimoto similarity is used. If None, the network is treated as unweighted and the Jaccard similarity is used.
+    :param simthr: None by default. If set to a float, all values below the threshold are set to 0 in the similarity matrix (this can reduce memory usage).
+    :param hcmethod: linkage method used in hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage for the full method list.
+    :param min_edges: None by default. If set to an integer, the minimum number of edges that a community must contain to be kept in the clustering.
+    :param verbose: if True, write intermediary steps to disk.
+ :return: EdgeClustering object
+
+
+ :Example:
+
+ >>> from cdlib import algorithms
+ >>> import networkx as nx
+ >>> G = nx.karate_club_graph()
+ >>> com = algorithms.hierarchical_link_community_full(G)
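+    >>> # a minimal sketch of the optional parameters: Jaccard similarity (weight=None)
+    >>> # and keeping only communities with at least 3 edges
+    >>> com_small = algorithms.hierarchical_link_community_full(G, weight=None, min_edges=3)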
+
+ :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. "Link communities reveal multiscale complexity in networks." Nature 466.7307 (2010): 761.
+ """
+ g = convert_graph_formats(g_original, nx.Graph)
+ g_number, dictio = nx_node_integer_mapping(g)
+
+    coms = HLC_full(g_number, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio).clusters
+ clustering = EdgeClustering(coms, g_number, "HLC_f", method_parameters={})
+    if dictio is not None: clustering.communities = remap_edge_communities(clustering.communities, dictio)
+ return clustering
\ No newline at end of file
diff --git a/cdlib/algorithms/internal/HLC.py b/cdlib/algorithms/internal/HLC.py
index e20daf11..8a1ed0fc 100644
--- a/cdlib/algorithms/internal/HLC.py
+++ b/cdlib/algorithms/internal/HLC.py
@@ -2,7 +2,9 @@
from heapq import heappush, heappop
from itertools import combinations, chain
from collections import defaultdict
-
+import networkx as nx
+import numpy as np
+from scipy.cluster.hierarchy import linkage
# Hierarchical Link Community
"""
@@ -220,3 +222,187 @@ def cal_jaccard(intersect_val, left_val, right_val):
similarity_ratio = cal_jaccard(ai_dot_aj, n2a_sqrd[i], n2a_sqrd[j])
heappush(min_heap, (1 - similarity_ratio, edge_pair))
return [heappop(min_heap) for _ in range(len(min_heap))]
+
+class HLC_full(object):
+    def __init__(self, net, weight='weight', simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        self.edge_counter = 0
+        self.clusters = self.edge_clustering(net, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio)
+
+ def edge_clustering(self, net, weight=None, simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        condensed_dist_vector, edge_matrix_len, edge_list = self.get_edge_similarity_matrix(net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio)  # TODO: add a Jaccard matrix that does not use weights
+ clustering = linkage(condensed_dist_vector, hcmethod)
+ final_clusters = self.get_clusters_by_partition_density(clustering, edge_matrix_len, edge_list, min_edges=min_edges)
+ return final_clusters
+
+ def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=False, dictio=None):
+ node_list = list(net.nodes())
+ node_list.sort()
+ adj = nx.adjacency_matrix(net, weight=weight, nodelist=node_list)
+        adj = adj.toarray()  # convert the sparse CSR matrix to a dense array; the dot product below differs if this line is removed!
+
+        if weight is None:
+ degree = np.sum(adj, axis=1)
+ np.fill_diagonal(adj, 1)
+ else:
+ degree = adj > 0
+ degree = np.sum(degree, axis=1)
+            weight_sum = np.sum(adj, axis=1)
+            np.fill_diagonal(adj, weight_sum/degree)  # Ahn et al., equation (4) in the supplementary material
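+            # weighted (Tanimoto) variant: the diagonal entry a_ii is the mean weight of
+            # i's edges, so each node counts as its own neighbour in the similarity
+            # vectors compared below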
+
+        dotproduct = np.dot(adj, adj)  # efficiently computes all the vector products needed for the Tanimoto coefficient
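+        # dotproduct[i, j] == a_i . a_j and dotproduct[i, i] == |a_i|^2, the inner
+        # products consumed by get_tanimoto_index / get_jaccard_index below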
+        adj = None  # drop the matrix to save memory
+        edge_dict = {}
+        data = []; col_i = []; col_j = []  # to store the similarities as sparse data
+        cache = {}  # caches each pair's similarity to avoid recalculating it when the pair is repeated
+        # k_node, i_neigh and j_neigh are both adjacency matrix indexes and node names
+ edge_similarities = []
+        for k_node in node_list:  # take each node as keystone and compute the similarity for every pair of its edges
+            neighbor_list = list(net.neighbors(k_node))  # take the neighbours to enumerate edge pairs
+            neighbor_list.sort()
+            if len(neighbor_list) < 2: continue  # skip keystone nodes that do not have enough neighbours to form an edge pair
+ while len(neighbor_list) > 0:
+ i_neigh = neighbor_list.pop()
+ for j_neigh in neighbor_list:
+                if weight is not None:
+ sim = self.get_tanimoto_index(i_neigh, j_neigh, dotproduct, cache)
+ else:
+ sim = self.get_jaccard_index(i_neigh, j_neigh, dotproduct, degree, cache)
+                    if simthr is not None and sim < simthr: continue
+ pair = [self.get_edge_id(k_node, i_neigh, edge_dict), self.get_edge_id(k_node, j_neigh, edge_dict)]
+ pair.sort()
+ ki_edge_id, kj_edge_id = pair
+ data.append(sim)
+ if verbose:
+ a_pair = '_'.join(sorted([dictio[k_node], dictio[i_neigh]]))
+ b_pair = '_'.join(sorted([dictio[k_node], dictio[j_neigh]]))
+ ids = sorted([a_pair, b_pair])
+ edge_similarities.append(ids + [str(sim)])
+ col_i.append(ki_edge_id)
+ col_j.append(kj_edge_id)
+
+ if verbose:
+ with open('edge_scores.txt', 'w') as f:
+ for e in edge_similarities: f.write("\t".join(e) + "\n")
+ condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(data, col_i, col_j)
+ return condensed_dist_vector, edge_matrix_len, list(edge_dict.keys())
+
+ def get_tanimoto_index(self, i_neigh, j_neigh, dotproduct, cache):
+ sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+ a_i2 = dotproduct[i_neigh,i_neigh]
+ a_j2 = dotproduct[j_neigh,j_neigh]
+ a_ij = dotproduct[i_neigh, j_neigh]
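+            # Tanimoto coefficient: S = a_i . a_j / (|a_i|^2 + |a_j|^2 - a_i . a_j)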
+            sim = a_ij / (a_i2 + a_j2 - a_ij)
+ cache[pair] = sim
+ return sim
+
+ def get_jaccard_index(self, i_neigh, j_neigh, dotproduct, degree, cache):
+ sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+ a_ij = dotproduct[i_neigh, j_neigh]
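+            # Jaccard-style similarity: shared (inclusive) neighbours of i and j divided by the smaller of their degrees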
+            sim = a_ij / min(degree[i_neigh], degree[j_neigh])
+ cache[pair] = sim
+ return sim
+
+ def get_sim(self, i_neigh, j_neigh, cache):
+ pair = [i_neigh, j_neigh]
+ pair.sort()
+ pair = tuple(pair)
+ sim = cache.get(pair)
+ return sim, pair
+
+ def get_edge_id(self, a, b, e_dict):
+ e = [a, b]
+ e.sort()
+ edge_id = tuple(e)
+ e_index = e_dict.get(edge_id)
+        if e_index is None:
+ e_index = self.edge_counter
+ e_dict[edge_id] = e_index
+ self.edge_counter += 1
+ return e_index
+
+ def get_distance_condensed_vector(self, data, col_i, col_j):
+        edge_matrix_len = max([max(col_i), max(col_j)]) + 1  # values in col_i and col_j are 0-based indexes, so add 1 to get the matrix size
+ upper_triangle_size = (edge_matrix_len**2 - edge_matrix_len)//2
+ condensed_vector = np.ones(upper_triangle_size)
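+        # scipy's linkage consumes this condensed (flattened upper-triangle) distance
+        # form; pairs with no recorded similarity keep the maximum distance of 1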
+ for idx, sim in enumerate(data): # m * i + j - ((i + 2) * (i + 1)) / 2 from https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+ i = col_i[idx]
+ j = col_j[idx]
+ v = edge_matrix_len * i + j - ((i + 2) * (i + 1)) // 2
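+            # e.g. with edge_matrix_len = 4, the pair (i=1, j=3) maps to v = 4*1 + 3 - (3*2)//2 = 4,
+            # matching the condensed order (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)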
+ condensed_vector[v] = 1 - sim
+ return condensed_vector, edge_matrix_len
+
+ def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min_edges=None):
+ tree = {} # clust id : [member_ids]
+ edges_per_cluster = {} #clust_id : [ edge_tuples ]
+ partial_partition_densities = {} #clust_id : [ cluster_partition_density ]
+
+ counter = edge_len # this works as cluster id. This is used by the linkage method to tag the intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+ last_dist = None
+ constant = 2/edge_len
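+        # 2/M prefactor of the partition density D = (2/M) * sum of the per-cluster
+        # partials computed in get_cluster_partial_partition_density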
+ last_cluster_pool = []
+ max_pden = -10000000000
+ max_cluster_ids = []
+ for a_id, b_id, dist, n_members in clustering:
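+            # each linkage row merges clusters a_id and b_id at distance dist into a new
+            # cluster of n_members leaf edges; ids below edge_len denote single edges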
+            dist = round(dist, 5)  # make distances that differ only by very small values compare equal
+            if last_dist is not None and dist != last_dist:  # several merges can happen at the same distance, so group those merge events before calculating the partition density
+ p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+ if p_den > max_pden: # check the best partition density
+ max_pden = p_den
+ max_cluster_ids = last_cluster_pool
+ a_id = int(a_id) # Linkage method returns member ids as float instead of int
+ b_id = int(b_id)
+ member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree) # members that we merge to build the new agglomerative cluster
+ nodes, edges = self.get_nodesNedges_per_cluster(member_list, edge_list)
+ edges_per_cluster[counter] = edges
+ partial_partition_densities[counter] = self.get_cluster_partial_partition_density(member_list, nodes)
+            last_cluster_pool = [cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id]]  # update the pool, removing the merged cluster ids; the new cluster id is appended below
+ last_cluster_pool.append(counter)
+ last_dist = dist
+ counter += 1
+
+        p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)  # compute the partition density for the final distance level
+        if p_den > max_pden:  # the last distance level is not checked inside the loop
+ max_pden = p_den
+ max_cluster_ids = last_cluster_pool
+
+        final_clusters = []
+ for cluster_id in max_cluster_ids:
+ members = edges_per_cluster[cluster_id]
+            if min_edges is None or len(members) >= min_edges:
+ final_clusters.append(members)
+ return final_clusters
+
+ def add_cluster_members(self, cluster, member_id, n_records, tree):
+        if member_id < n_records:  # ids below n_records are leaves, i.e. single original observations, as described in https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+ cluster.append(member_id)
+        else:  # the id represents the merge of two previous clusters; pop their member list from the tree and fold it into the new cluster
+ cluster.extend(tree.pop(member_id))
+
+ def get_member_list(self, cluster_id, a_id, b_id, edge_len, tree):
+ member_list = []
+        self.add_cluster_members(member_list, a_id, edge_len, tree)  # collect the members of previous cluster a
+        self.add_cluster_members(member_list, b_id, edge_len, tree)  # collect the members of previous cluster b
+ tree[cluster_id] = member_list
+ return member_list
+
+ def get_nodesNedges_per_cluster(self, members, edge_list):
+ nodes = []
+ edges = []
+ for member in members:
+ edge = edge_list[member]
+ edges.append(edge)
+ nodes.extend(edge) # Add edge nodes to node list
+ return list(set(nodes)), edges
+
+ def get_cluster_partial_partition_density(self, edges, nodes):
+        n = len(nodes)  # node count
+        m = len(edges)  # link count
+        # return (m - (n - 1)) / (n * (n - 1) / 2 - (n - 1))  # Ahn's per-community density D_c
+        return (m * (m - n + 1)) / ((n - 2) * (n - 1))  # kalinka variant, equal to m * D_c / 2
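+        # multiplied by the 2/M constant applied in get_pden, the sum of these partials
+        # matches Ahn's partition density D = (2/M) * sum_c m_c*(m_c-n_c+1)/((n_c-2)*(n_c-1))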
+
+ def get_pden(self, last_cluster_pool, partial_partition_densities, constant):
+        partition_den_sum = sum([partial_partition_densities[cl_id] for cl_id in last_cluster_pool])  # sum of per-cluster partial densities
+ p_den = constant * partition_den_sum
+ return p_den
\ No newline at end of file
diff --git a/cdlib/test/test_edgeclustering.py b/cdlib/test/test_edgeclustering.py
index 8a591cd2..d78c07d5 100644
--- a/cdlib/test/test_edgeclustering.py
+++ b/cdlib/test/test_edgeclustering.py
@@ -12,6 +12,16 @@ def test_to_json(self):
js = coms.to_json()
self.assertIsInstance(js, str)
+ coms = algorithms.hierarchical_link_community_w(g)
+ self.assertIsInstance(coms, EdgeClustering)
+ js = coms.to_json()
+ self.assertIsInstance(js, str)
+
+ coms = algorithms.hierarchical_link_community_full(g)
+ self.assertIsInstance(coms, EdgeClustering)
+ js = coms.to_json()
+ self.assertIsInstance(js, str)
+
def test_node_map(self):
g = nx.karate_club_graph()
coms = algorithms.hierarchical_link_community(g)
diff --git a/cdlib/utils.py b/cdlib/utils.py
index ae9b74a9..528f5909 100644
--- a/cdlib/utils.py
+++ b/cdlib/utils.py
@@ -239,6 +239,20 @@ def remap_node_communities(communities: object, node_map: dict) -> list:
communities = cms
return communities
+def remap_edge_communities(communities: object, node_map: dict) -> list:  # the node-based remap_node_communities cannot handle edge tuples
+    """Apply a map to the obtained communities to retrieve the original node labels
+
+    :param communities: list of edge communities (lists of node-id tuples)
+ :param node_map: dictionary
+ :return: remapped communities
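+
+    :Example:
+
+    >>> # a minimal sketch with a hypothetical integer-to-label map
+    >>> remap_edge_communities([[(0, 1), (1, 2)]], {0: "a", 1: "b", 2: "c"})
+    [[('a', 'b'), ('b', 'c')]]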
+ """
+
+ cms = []
+ for community in communities:
+ community = [(node_map[e[0]], node_map[e[1]]) for e in community]
+ cms.append(community)
+ communities = cms
+ return communities
def affiliations2nodesets(affiliations: dict) -> dict:
"""