Source code for karateclub.node_embedding.neighbourhood.diff2vec

import numpy as np
import networkx as nx
from gensim.models.word2vec import Word2Vec
from karateclub.utils.diffuser import EulerianDiffuser
from karateclub.estimator import Estimator


class Diff2Vec(Estimator):
    r"""An implementation of `"Diff2Vec" <http://homepages.inf.ed.ac.uk/s1668259/papers/sequence.pdf>`_
    from the CompleNet '18 paper "Diff2Vec: Fast Sequence Based Embedding with Diffusion Graphs".
    The procedure creates diffusion trees from every source node in the graph.
    These graphs are linearized by a directed Eulerian walk, the walks are used
    for running the skip-gram algorithm to learn node level neighbourhood based
    embeddings.

    Args:
        diffusion_number (int): Number of diffusions. Default is 10.
        diffusion_cover (int): Number of nodes in diffusion. Default is 80.
        dimensions (int): Dimensionality of embedding. Default is 128.
        workers (int): Number of Word2Vec worker threads. Default is 4.
        window_size (int): Context window size passed to Word2Vec. Default is 5.
        epochs (int): Number of epochs. Default is 1.
        learning_rate (float): Initial Word2Vec learning rate (``alpha``). Default is 0.05.
        min_count (int): Minimal count of node occurrences. Default is 1.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(
        self,
        diffusion_number: int = 10,
        diffusion_cover: int = 80,
        dimensions: int = 128,
        workers: int = 4,
        window_size: int = 5,
        epochs: int = 1,
        learning_rate: float = 0.05,
        min_count: int = 1,
        seed: int = 42,
    ):
        # Hyperparameters are only stored here; all work happens in ``fit``.
        self.diffusion_number = diffusion_number
        self.diffusion_cover = diffusion_cover
        self.dimensions = dimensions
        self.workers = workers
        self.window_size = window_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed

    def fit(self, graph: nx.classes.graph.Graph) -> None:
        """
        Fitting a Diff2Vec model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        # ``_set_seed`` and ``_check_graph`` are not defined in this class;
        # presumably they are inherited from ``Estimator`` - see that class
        # for the exact seeding and validation rules.
        self._set_seed()
        graph = self._check_graph(graph)

        # Generate the diffusion sequences that serve as the Word2Vec corpus.
        diffuser = EulerianDiffuser(self.diffusion_number, self.diffusion_cover)
        diffuser.do_diffusions(graph)

        # NOTE(review): ``sg`` is not passed, so gensim's default training
        # algorithm is used (CBOW according to the gensim docs), although the
        # class docstring speaks of skip-gram - confirm this is intended.
        # ``hs=1`` is set explicitly (hierarchical softmax in gensim).
        model = Word2Vec(
            diffuser.diffusions,
            hs=1,
            alpha=self.learning_rate,
            epochs=self.epochs,
            vector_size=self.dimensions,
            window=self.window_size,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
        )

        # Vectors are looked up under the keys "0" .. "n-1", so entry ``i`` of
        # the embedding belongs to node ``i``. This presumes the diffusion
        # sequences hold stringified consecutive integer node ids.
        num_of_nodes = graph.number_of_nodes()
        self._embedding = [model.wv[str(n)] for n in range(num_of_nodes)]

    def get_embedding(self) -> np.ndarray:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        # ``_embedding`` is only created by ``fit``; it is stored as a list of
        # vectors and converted to a single array on every call.
        return np.array(self._embedding)