Source code for karateclub.community_detection.non_overlapping.gemsec

import random
import numpy as np
import networkx as nx
from typing import Dict
from karateclub.estimator import Estimator
from karateclub.utils.walker import RandomWalker


class GEMSEC(Estimator):
    r"""An implementation of `"GEMSEC" <https://arxiv.org/abs/1802.03997>`_
    from the ASONAM '19 paper "GEMSEC: Graph Embedding with Self Clustering".
    The procedure uses random walks to approximate the pointwise mutual
    information matrix obtained by pooling normalized adjacency matrix powers.
    This matrix is decomposed by an approximate factorization technique which
    is combined with a k-means like clustering cost. A node embedding and
    clustering are learned jointly.

    Args:
        walk_number (int): Number of random walks. Default is 5.
        walk_length (int): Length of random walks. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 32.
        negative_samples (int): Number of negative samples. Default is 5.
        window_size (int): Matrix power order. Default is 5.
        learning_rate (float): Gradient descent learning rate. Default is 0.1.
        clusters (int): Number of cluster centers. Default is 10.
        gamma (float): Clustering cost weight coefficient. Default is 0.1.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(
        self,
        walk_number: int = 5,
        walk_length: int = 80,
        dimensions: int = 32,
        negative_samples: int = 5,
        window_size: int = 5,
        learning_rate: float = 0.1,
        clusters: int = 10,
        gamma: float = 0.1,
        seed: int = 42,
    ):
        self.walk_number = walk_number
        self.walk_length = walk_length
        self.dimensions = dimensions
        self.negative_samples = negative_samples
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.clusters = clusters
        self.gamma = gamma
        self.seed = seed

    def _setup_sampling_weights(self, graph):
        """Creating a negative sampling table.

        Each node is entered once per unit of degree, so a uniform draw from
        the table samples nodes proportionally to their degree.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph for negative sampling.
        """
        self._sampler = {}
        index = 0
        for node in graph.nodes():
            for _ in range(graph.degree(node)):
                self._sampler[index] = node
                index = index + 1
        # Highest valid table index; random.randint below is inclusive.
        self._global_index = index - 1

    def _initialize_node_embeddings(self, graph):
        """Creating a node embedding array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (graph.number_of_nodes(), self.dimensions)
        self._base_embedding = np.random.normal(0, 1.0 / self.dimensions, shape)

    def _initialize_cluster_centers(self, graph):
        """Creating a cluster center array.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        shape = (self.dimensions, self.clusters)
        self._cluster_centers = np.random.normal(0, 1.0 / self.dimensions, shape)

    def _sample_negative_samples(self):
        """Sampling a batch of nodes as negative samples.

        Return types:
            * **negative_samples** *(list)*: List of negative sampled nodes.
        """
        negative_samples = [
            self._sampler[random.randint(0, self._global_index)]
            for _ in range(self.negative_samples)
        ]
        return negative_samples

    def _calculate_noise_vector(self, negative_samples, source_node):
        """Getting the noise vector for the weight update.

        Arg types:
            * **negative_samples** *(list)*: List of negative sampled nodes.
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **noise_vector** *(NumPy array)* - Noise update vector.
        """
        noise_vectors = self._base_embedding[negative_samples, :]
        source_vector = self._base_embedding[int(source_node), :]
        raw_scores = noise_vectors.dot(source_vector.T)
        # Clip before exponentiation to avoid overflow in exp.
        raw_scores = np.exp(np.clip(raw_scores, -15, 15))
        scores = raw_scores / np.sum(raw_scores)
        scores = scores.reshape(-1, 1)
        # Softmax-weighted average of the negative sample embeddings.
        noise_vector = np.sum(scores * noise_vectors, axis=0)
        return noise_vector

    def _calculate_cluster_vector(self, source_node):
        """Getting the cluster vector for the weight update.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.

        Return types:
            * **cluster_vector** *(NumPy array)* - Cluster update vector
              (unit direction from the closest center to the node).
            * **cluster_index** *(int)*: Node cluster membership index.
        """
        distances = (
            self._base_embedding[int(source_node), :].reshape(-1, 1)
            - self._cluster_centers
        )
        # Euclidean distance to every cluster center.
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        cluster_vector = distances[:, cluster_index] / scores[cluster_index]
        return cluster_vector, cluster_index

    def _do_descent_for_pair(self, negative_samples, source_node, target_node):
        """Updating the cluster center and the node embedding.

        Arg types:
            * **negative_samples** *(list)* - Negative samples.
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        noise_vector = self._calculate_noise_vector(negative_samples, source_node)
        target_vector = self._base_embedding[int(target_node), :]
        cluster_vector, cluster_index = self._calculate_cluster_vector(source_node)
        # Skip-gram style gradient plus the weighted clustering cost gradient.
        node_gradient = noise_vector - target_vector + self.gamma * cluster_vector
        node_gradient = node_gradient / np.linalg.norm(node_gradient)
        self._base_embedding[int(source_node), :] += -self.learning_rate * node_gradient
        # Move the closest center toward the node.
        self._cluster_centers[:, cluster_index] += (
            self.learning_rate * self.gamma * cluster_vector
        )

    def _update_a_weight(self, source_node, target_node):
        """Updating the weights for a pair of nodes.

        The pair is updated symmetrically, reusing one negative sample batch.

        Arg types:
            * **source_node** *(int)* - Source node in the walk.
            * **target_node** *(int)* - Target node in the walk.
        """
        negative_samples = self._sample_negative_samples()
        self._do_descent_for_pair(negative_samples, source_node, target_node)
        self._do_descent_for_pair(negative_samples, target_node, source_node)

    def _do_gradient_descent(self):
        """Updating the embedding weights and cluster centers with gradient
        descent over every (source, target) pair within the context window
        of every random walk.
        """
        random.shuffle(self._walker.walks)
        for walk in self._walker.walks:
            # Stop early enough that every window position has a target.
            for i, source_node in enumerate(
                walk[: self.walk_length - self.window_size]
            ):
                for step in range(1, self.window_size + 1):
                    target_node = walk[i + step]
                    self._update_a_weight(source_node, target_node)

    def fit(self, graph: nx.classes.graph.Graph):
        """Fitting a GEMSEC model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        graph = self._check_graph(graph)
        self._setup_sampling_weights(graph)
        self._walker = RandomWalker(self.walk_length, self.walk_number)
        self._walker.do_walks(graph)
        self._initialize_node_embeddings(graph)
        self._initialize_cluster_centers(graph)
        self._do_gradient_descent()

    def get_embedding(self) -> np.array:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)*: The embedding of nodes.
        """
        return np.array(self._base_embedding)

    def _get_membership(self, node):
        """Getting the cluster membership of a node.

        Arg types:
            * **node** *(int)* - The node index to be assigned to a cluster.

        Return types:
            * **cluster_index** *(int)*: Node cluster membership index.
        """
        distances = self._base_embedding[node, :].reshape(-1, 1) - self._cluster_centers
        scores = np.power(np.sum(np.power(distances, 2), axis=0), 0.5)
        cluster_index = np.argmin(scores)
        return cluster_index

    def get_memberships(self) -> Dict[int, int]:
        r"""Getting the cluster membership of nodes.

        Return types:
            * **memberships** *(dict)*: Node cluster memberships.
        """
        memberships = {
            node: self._get_membership(node)
            for node in range(self._base_embedding.shape[0])
        }
        return memberships