diff --git a/cdlib/algorithms/edge_clustering.py b/cdlib/algorithms/edge_clustering.py
index 54e55e91..ec591c24 100644
--- a/cdlib/algorithms/edge_clustering.py
+++ b/cdlib/algorithms/edge_clustering.py
@@ -1,10 +1,10 @@
 from cdlib import EdgeClustering
 from collections import defaultdict
 import networkx as nx
-from cdlib.utils import convert_graph_formats
-from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted
+from cdlib.utils import convert_graph_formats, nx_node_integer_mapping, remap_edge_communities
+from cdlib.algorithms.internal.HLC import HLC, HLC_read_edge_list_unweighted, HLC_read_edge_list_weighted, HLC_full
 
-__all__ = ["hierarchical_link_community"]
+__all__ = ["hierarchical_link_community", "hierarchical_link_community_w", "hierarchical_link_community_full"]
 
 
 def hierarchical_link_community(g_original: object) -> EdgeClustering:
@@ -50,3 +50,92 @@ def hierarchical_link_community(g_original: object) -> EdgeClustering:
     coms = [list(c) for c in coms.values()]
     return EdgeClustering(coms, g_original, "HLC", method_parameters={})
+
+
+def hierarchical_link_community_w(g_original: object) -> EdgeClustering:
+    """
+    HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+    The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
+    At each level of the link dendrogram the partition density function, based on the link density inside communities, is computed to pick the best level to cut.
+
+
+    **Supported Graph Types**
+
+    ========== ======== ========
+    Undirected Directed Weighted
+    ========== ======== ========
+    Yes        No       Yes
+    ========== ======== ========
+
+    :param g_original: a networkx/igraph object
+    :return: EdgeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> com = algorithms.hierarchical_link_community_w(G)
+
+    :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. Link communities reveal multiscale complexity in networks. Nature 466.7307 (2010): 761.
+    """
+
+    g = convert_graph_formats(g_original, nx.Graph)
+
+    adj, edges, ij2wij = HLC_read_edge_list_weighted(g)
+    edge2cid, _, _, _ = HLC(adj, edges).single_linkage(w=ij2wij)
+
+    coms = defaultdict(list)
+    for e, com in edge2cid.items():
+        coms[com].append(e)
+
+    coms = [list(c) for c in coms.values()]
+    return EdgeClustering(coms, g_original, "HLC_w", method_parameters={})
+
+
+def hierarchical_link_community_full(g_original: object, weight='weight', simthr=None, hcmethod='single', min_edges=None, verbose=False) -> EdgeClustering:
+    """
+    HLC (hierarchical link clustering) is a method to classify links into topologically related groups.
+    The algorithm uses a similarity between links to build a dendrogram where each leaf is a link from the original network and branches represent link communities.
+    At each level of the link dendrogram the partition density function, based on the link density inside communities, is computed to pick the best level to cut.
+    This implementation follows the algorithm described in Ahn et al. and uses numpy/scipy to speed up the clustering computation (it is faster and consumes less memory).
+
+
+    **Supported Graph Types**
+
+    ========== ======== ========
+    Undirected Directed Weighted
+    ========== ======== ========
+    Yes        No       Yes
+    ========== ======== ========
+
+    :param g_original: a networkx/igraph object
+    :param weight: None for unweighted networks, in which case the Jaccard approximation is used. When set to a string, the edge attribute with that name (usually 'weight') is used as weight and the Tanimoto approximation is used.
+    :param simthr: None by default. If set to a float, all similarity values below this threshold are set to 0 in the similarity matrix (this can reduce memory usage).
+    :param hcmethod: linkage method used in the hierarchical clustering, 'single' by default. See scipy.cluster.hierarchy.linkage for the full list of methods.
+    :param min_edges: None by default. If set to an integer, the minimum number of edges that a community must contain to be kept in the clustering.
+    :param verbose: if True, intermediate edge similarity scores are written to disk.
+    :return: EdgeClustering object
+
+
+    :Example:
+
+    >>> from cdlib import algorithms
+    >>> import networkx as nx
+    >>> G = nx.karate_club_graph()
+    >>> com = algorithms.hierarchical_link_community_full(G)
+
+    :References:
+
+    Ahn, Yong-Yeol, James P. Bagrow, and Sune Lehmann. Link communities reveal multiscale complexity in networks. Nature 466.7307 (2010): 761.
+    """
+    g = convert_graph_formats(g_original, nx.Graph)
+    g_number, dictio = nx_node_integer_mapping(g)
+
+    coms = HLC_full(g_number, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio).clusters
+    clustering = EdgeClustering(coms, g_number, "HLC_f", method_parameters={})
+    if dictio is not None:
+        clustering.communities = remap_edge_communities(clustering.communities, dictio)
+    return clustering
\ No newline at end of file
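A minimal usage sketch of the new entry point (not part of the patch; parameter values are illustrative, and if the graph carries no 'weight' attribute the adjacency falls back to unit weights):

    from cdlib import algorithms
    import networkx as nx

    G = nx.karate_club_graph()
    com = algorithms.hierarchical_link_community_full(
        G,
        weight='weight',   # edge attribute -> Tanimoto similarity; None -> Jaccard
        simthr=0.1,        # drop similarities below 0.1 to save memory
        min_edges=3,       # keep only link communities with at least 3 edges
    )
    print(com.communities[:2])  # each community is a list of (u, v) edge tuples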
diff --git a/cdlib/algorithms/internal/HLC.py b/cdlib/algorithms/internal/HLC.py
index e20daf11..8a1ed0fc 100644
--- a/cdlib/algorithms/internal/HLC.py
+++ b/cdlib/algorithms/internal/HLC.py
@@ -2,7 +2,9 @@
 from heapq import heappush, heappop
 from itertools import combinations, chain
 from collections import defaultdict
-
+import networkx as nx
+import numpy as np
+from scipy.cluster.hierarchy import linkage
 
 # Hierarchical Link Community
 """
@@ -220,3 +222,187 @@ def cal_jaccard(intersect_val, left_val, right_val):
         similarity_ratio = cal_jaccard(ai_dot_aj, n2a_sqrd[i], n2a_sqrd[j])
         heappush(min_heap, (1 - similarity_ratio, edge_pair))
     return [heappop(min_heap) for _ in range(len(min_heap))]
+
+
+class HLC_full(object):
+    def __init__(self, net, weight='weight', simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        self.edge_counter = 0
+        self.clusters = self.edge_clustering(net, weight=weight, simthr=simthr, hcmethod=hcmethod, min_edges=min_edges, verbose=verbose, dictio=dictio)
+
+    def edge_clustering(self, net, weight=None, simthr=None, hcmethod=None, min_edges=None, verbose=False, dictio=None):
+        condensed_dist_vector, edge_matrix_len, edge_list = self.get_edge_similarity_matrix(net, weight=weight, simthr=simthr, verbose=verbose, dictio=dictio)  # TODO: add a Jaccard matrix that does not use weights
+        clustering = linkage(condensed_dist_vector, hcmethod)
+        final_clusters = self.get_clusters_by_partition_density(clustering, edge_matrix_len, edge_list, min_edges=min_edges)
+        return final_clusters
+
+    def get_edge_similarity_matrix(self, net, weight=None, simthr=None, verbose=False, dictio=None):
+        node_list = list(net.nodes())
+        node_list.sort()
+        adj = nx.adjacency_matrix(net, weight=weight, nodelist=node_list)
+        adj = adj.toarray()  # needed since the sparse csr return type changed from matrix to array; the dot product differs if this line is removed!
+
+        if weight is None:
+            degree = np.sum(adj, axis=1)
+            np.fill_diagonal(adj, 1)
+        else:
+            degree = adj > 0
+            degree = np.sum(degree, axis=1)
+            weight_sum = np.sum(adj, axis=1)
+            np.fill_diagonal(adj, weight_sum / degree)  # Ahn, equation 4 in the supplementary file
+
+        dotproduct = np.dot(adj, adj)  # efficiently computes all the vector products needed for the Tanimoto coefficient (a_i . a_j)
+        adj = None  # drop the matrix to save memory
+        edge_dict = {}
+        data = []; col_i = []; col_j = []  # to store the Tanimoto similarities as sparse data
+        cache = {}  # cache the similarity of each pair to avoid recomputing it when a pair is repeated
+        # k_node, i_neigh and j_neigh are adjacency matrix indexes AND node names
+        edge_similarities = []
+        for k_node in node_list:  # take each node as keystone and compute the similarity for each pair of its edges
+            neighbor_list = list(net.neighbors(k_node))  # take the neighbours to build edge pairs and compare edges
+            neighbor_list.sort()
+            if len(neighbor_list) < 2: continue  # skip keystone nodes that do not have enough neighbours to compute a similarity
+            while len(neighbor_list) > 0:
+                i_neigh = neighbor_list.pop()
+                for j_neigh in neighbor_list:
+                    if weight is not None:
+                        sim = self.get_tanimoto_index(i_neigh, j_neigh, dotproduct, cache)
+                    else:
+                        sim = self.get_jaccard_index(i_neigh, j_neigh, dotproduct, degree, cache)
+                    if simthr is not None and sim < simthr: continue
+                    pair = [self.get_edge_id(k_node, i_neigh, edge_dict), self.get_edge_id(k_node, j_neigh, edge_dict)]
+                    pair.sort()
+                    ki_edge_id, kj_edge_id = pair
+                    data.append(sim)
+                    if verbose:
+                        a_pair = '_'.join(sorted([dictio[k_node], dictio[i_neigh]]))
+                        b_pair = '_'.join(sorted([dictio[k_node], dictio[j_neigh]]))
+                        ids = sorted([a_pair, b_pair])
+                        edge_similarities.append(ids + [str(sim)])
+                    col_i.append(ki_edge_id)
+                    col_j.append(kj_edge_id)
+
+        if verbose:
+            with open('edge_scores.txt', 'w') as f:
+                for e in edge_similarities: f.write("\t".join(e) + "\n")
+        condensed_dist_vector, edge_matrix_len = self.get_distance_condensed_vector(data, col_i, col_j)
+        return condensed_dist_vector, edge_matrix_len, list(edge_dict.keys())
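The dotproduct matrix above packs every a_i . a_j product needed by the two similarity helpers that follow. A minimal standalone NumPy sketch of the same Tanimoto computation on a toy weighted adjacency (values are illustrative):

    import numpy as np

    adj = np.array([[0., 2., 1., 0.],
                    [2., 0., 3., 1.],
                    [1., 3., 0., 0.],
                    [0., 1., 0., 0.]])
    degree = (adj > 0).sum(axis=1)
    np.fill_diagonal(adj, adj.sum(axis=1) / degree)  # a_ii = mean neighbour weight (Ahn, SI eq. 4)
    dot = adj @ adj                                  # dot[i, j] = a_i . a_j
    i, j = 0, 2                                      # two neighbours of keystone node 1
    tanimoto = dot[i, j] / (dot[i, i] + dot[j, j] - dot[i, j])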
+
+    def get_tanimoto_index(self, i_neigh, j_neigh, dotproduct, cache):
+        sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+            a_i2 = dotproduct[i_neigh, i_neigh]
+            a_j2 = dotproduct[j_neigh, j_neigh]
+            a_ij = dotproduct[i_neigh, j_neigh]
+            sim = a_ij / (a_i2 + a_j2 - a_ij)
+            cache[pair] = sim
+        return sim
+
+    def get_jaccard_index(self, i_neigh, j_neigh, dotproduct, degree, cache):
+        sim, pair = self.get_sim(i_neigh, j_neigh, cache)
+        if sim is None:
+            a_ij = dotproduct[i_neigh, j_neigh]
+            sim = a_ij / min(degree[i_neigh], degree[j_neigh])
+            cache[pair] = sim
+        return sim
+
+    def get_sim(self, i_neigh, j_neigh, cache):
+        pair = [i_neigh, j_neigh]
+        pair.sort()
+        pair = tuple(pair)
+        sim = cache.get(pair)
+        return sim, pair
+
+    def get_edge_id(self, a, b, e_dict):
+        e = [a, b]
+        e.sort()
+        edge_id = tuple(e)
+        e_index = e_dict.get(edge_id)
+        if e_index is None:
+            e_index = self.edge_counter
+            e_dict[edge_id] = e_index
+            self.edge_counter += 1
+        return e_index
+
+    def get_distance_condensed_vector(self, data, col_i, col_j):
+        edge_matrix_len = max([max(col_i), max(col_j)]) + 1  # values in col_i and col_j are 0-based indexes, so we add 1 to get the vector size
+        upper_triangle_size = (edge_matrix_len ** 2 - edge_matrix_len) // 2
+        condensed_vector = np.ones(upper_triangle_size)
+        for idx, sim in enumerate(data):  # index m * i + j - ((i + 2) * (i + 1)) / 2, from https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+            i = col_i[idx]
+            j = col_j[idx]
+            v = edge_matrix_len * i + j - ((i + 2) * (i + 1)) // 2
+            condensed_vector[v] = 1 - sim
+        return condensed_vector, edge_matrix_len
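The index arithmetic in get_distance_condensed_vector follows SciPy's condensed upper-triangular distance layout. A small self-contained check with toy values, comparing against scipy.spatial.distance.squareform:

    import numpy as np
    from scipy.spatial.distance import squareform

    n = 4
    pairs = {(0, 1): 0.2, (0, 3): 0.7, (1, 2): 0.5}
    full = np.zeros((n, n))
    for (i, j), d in pairs.items():
        full[i, j] = full[j, i] = d
    condensed = squareform(full)                  # SciPy's canonical condensed layout
    for (i, j), d in pairs.items():
        v = n * i + j - ((i + 2) * (i + 1)) // 2  # same formula as the loop above
        assert condensed[v] == d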
+
+    def get_clusters_by_partition_density(self, clustering, edge_len, edge_list, min_edges=None):
+        tree = {}  # cluster id : [member ids]
+        edges_per_cluster = {}  # cluster id : [edge tuples]
+        partial_partition_densities = {}  # cluster id : partial partition density of the cluster
+
+        counter = edge_len  # works as cluster id; the linkage method uses ids >= edge_len to tag the intermediate clusters: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+        last_dist = None
+        constant = 2 / edge_len
+        last_cluster_pool = []
+        max_pden = -10000000000
+        max_cluster_ids = []
+        for a_id, b_id, dist, n_members in clustering:
+            dist = round(dist, 5)  # treat distances that differ only by very small values as equal
+            if last_dist is not None and dist != last_dist:  # several clusters can be merged at the same distance, so merge events are grouped before computing the partition density
+                p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+                if p_den > max_pden:  # keep the best partition density
+                    max_pden = p_den
+                    max_cluster_ids = last_cluster_pool
+            a_id = int(a_id)  # the linkage method returns member ids as float instead of int
+            b_id = int(b_id)
+            member_list = self.get_member_list(counter, a_id, b_id, edge_len, tree)  # members merged to build the new agglomerative cluster
+            nodes, edges = self.get_nodesNedges_per_cluster(member_list, edge_list)
+            edges_per_cluster[counter] = edges
+            partial_partition_densities[counter] = self.get_cluster_partial_partition_density(member_list, nodes)
+            last_cluster_pool = [cl_id for cl_id in last_cluster_pool if cl_id not in [a_id, b_id]]  # update the pool, removing the merged cluster ids and adding the new cluster id
+            last_cluster_pool.append(counter)
+            last_dist = dist
+            counter += 1
+
+        p_den = self.get_pden(last_cluster_pool, partial_partition_densities, constant)
+        if p_den > max_pden:  # check the partition density at the last distance, which was not checked inside the loop
+            max_pden = p_den
+            max_cluster_ids = last_cluster_pool
+
+        final_clusters = []
+        for cluster_id in max_cluster_ids:
+            members = edges_per_cluster[cluster_id]
+            if min_edges is None or len(members) >= min_edges:
+                final_clusters.append(members)
+        return final_clusters
+
+    def add_cluster_members(self, cluster, member_id, n_records, tree):
+        if member_id < n_records:  # an id below n_records denotes a singleton cluster holding one original record, as described in https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+            cluster.append(member_id)
+        else:  # the id represents the merge of two previous clusters; take its member list from the tree and remove it, merging it into the new cluster
+            cluster.extend(tree.pop(member_id))
+
+    def get_member_list(self, cluster_id, a_id, b_id, edge_len, tree):
+        member_list = []
+        self.add_cluster_members(member_list, a_id, edge_len, tree)  # get the members of previous cluster a
+        self.add_cluster_members(member_list, b_id, edge_len, tree)  # get the members of previous cluster b
+        tree[cluster_id] = member_list
+        return member_list
+
+    def get_nodesNedges_per_cluster(self, members, edge_list):
+        nodes = []
+        edges = []
+        for member in members:
+            edge = edge_list[member]
+            edges.append(edge)
+            nodes.extend(edge)  # add the edge endpoints to the node list
+        return list(set(nodes)), edges
+
+    def get_cluster_partial_partition_density(self, edges, nodes):
+        n = len(nodes)  # node number
+        m = len(edges)  # link number
+        # return (m - (n - 1)) / (n * (n - 1) / 2 - (n - 1))  # Ahn, per-cluster density D_c
+        return (m * (m - n + 1)) / ((n - 2) * (n - 1))  # Kalinka, m_c-weighted form
+
+    def get_pden(self, last_cluster_pool, partial_partition_densities, constant):
+        partition_den_sum = sum([partial_partition_densities[cl_id] for cl_id in last_cluster_pool])  # partition density
+        p_den = constant * partition_den_sum
+        return p_den
\ No newline at end of file
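Together, get_cluster_partial_partition_density and the constant = 2/edge_len factor in get_pden compute Ahn's partition density D = (2/M) * sum_c m_c (m_c - n_c + 1) / ((n_c - 1)(n_c - 2)). A toy computation with illustrative link/node counts (communities with n_c = 2 contribute 0 by convention):

    clusters = [(5, 4), (3, 3)]      # (m_c, n_c): links and nodes per link community
    M = sum(m for m, _ in clusters)  # total number of links partitioned
    D = (2 / M) * sum(m * (m - n + 1) / ((n - 2) * (n - 1)) for m, n in clusters)
    # D == 0.25 * (5 * 2 / 6 + 3 * 1 / 2) ~= 0.79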
diff --git a/cdlib/test/test_edgeclustering.py b/cdlib/test/test_edgeclustering.py
index 8a591cd2..d78c07d5 100644
--- a/cdlib/test/test_edgeclustering.py
+++ b/cdlib/test/test_edgeclustering.py
@@ -12,6 +12,16 @@ def test_to_json(self):
         js = coms.to_json()
         self.assertIsInstance(js, str)
 
+        coms = algorithms.hierarchical_link_community_w(g)
+        self.assertIsInstance(coms, EdgeClustering)
+        js = coms.to_json()
+        self.assertIsInstance(js, str)
+
+        coms = algorithms.hierarchical_link_community_full(g)
+        self.assertIsInstance(coms, EdgeClustering)
+        js = coms.to_json()
+        self.assertIsInstance(js, str)
+
     def test_node_map(self):
         g = nx.karate_club_graph()
         coms = algorithms.hierarchical_link_community(g)
diff --git a/cdlib/utils.py b/cdlib/utils.py
index ae9b74a9..528f5909 100644
--- a/cdlib/utils.py
+++ b/cdlib/utils.py
@@ -239,6 +239,20 @@ def remap_node_communities(communities: object, node_map: dict) -> list:
     communities = cms
     return communities
 
+def remap_edge_communities(communities: object, node_map: dict) -> list:  # edge-tuple variant: remap_node_communities cannot handle (u, v) pairs
+    """Apply a map to the obtained communities to retrieve the original node labels
+
+    :param communities: list of edge communities (lists of node-id tuples)
+    :param node_map: dictionary mapping integer node ids to the original labels
+    :return: remapped communities
+    """
+
+    cms = []
+    for community in communities:
+        community = [(node_map[e[0]], node_map[e[1]]) for e in community]
+        cms.append(community)
+    communities = cms
+    return communities
 
 def affiliations2nodesets(affiliations: dict) -> dict:
     """
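A small sketch of what remap_edge_communities does to a single community, assuming node_map sends the integer ids produced by nx_node_integer_mapping back to the original labels:

    node_map = {0: "a", 1: "b", 2: "c"}
    communities = [[(0, 1), (1, 2)]]
    remapped = [[(node_map[u], node_map[v]) for (u, v) in c] for c in communities]
    # remapped == [[("a", "b"), ("b", "c")]]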