Source code for karateclub.graph_embedding.gl2vec

from typing import List

import networkx as nx
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from karateclub.estimator import Estimator
from karateclub.utils.treefeatures import WeisfeilerLehmanHashing


class GL2Vec(Estimator):
    r"""An implementation of `"GL2Vec" <https://link.springer.com/chapter/10.1007/978-3-030-36718-3_1>`_
    from the ICONIP '19 paper "GL2vec: Graph Embedding Enriched by Line Graphs
    with Edge Features". First, the algorithm creates the line graph of each
    graph in the graph dataset. The procedure then creates Weisfeiler-Lehman
    tree features for the nodes of these line graphs. Using these features, a
    document (graph) - feature co-occurrence matrix is decomposed in order to
    generate representations for the graphs.

    The procedure assumes that nodes have no string feature present and the
    WL-hashing defaults to the degree centrality. However, if a node feature
    with the key "feature" is present, the feature extraction happens based on
    the values of this key. A usage sketch follows the class definition below.

    Args:
        wl_iterations (int): Number of Weisfeiler-Lehman iterations. Default is 2.
        dimensions (int): Dimensionality of embedding. Default is 128.
        workers (int): Number of cores. Default is 4.
        down_sampling (float): Down sampling frequency. Default is 0.0001.
        epochs (int): Number of epochs. Default is 10.
        learning_rate (float): HogWild! learning rate. Default is 0.025.
        min_count (int): Minimal count of graph feature occurrences. Default is 5.
        seed (int): Random seed for the model. Default is 42.
        erase_base_features (bool): Whether to erase the base node features. Default is False.
    """

    def __init__(
        self,
        wl_iterations: int = 2,
        dimensions: int = 128,
        workers: int = 4,
        down_sampling: float = 0.0001,
        epochs: int = 10,
        learning_rate: float = 0.025,
        min_count: int = 5,
        seed: int = 42,
        erase_base_features: bool = False,
    ):
        self.wl_iterations = wl_iterations
        self.dimensions = dimensions
        self.workers = workers
        self.down_sampling = down_sampling
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.min_count = min_count
        self.seed = seed
        self.erase_base_features = erase_base_features

    def _create_line_graph(self, graph):
        r"""Creating the line graph of a source graph.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be transformed into a line graph.

        Return types:
            * **line_graph** *(NetworkX graph)* - The line graph of the source graph.
        """
        graph = nx.line_graph(graph)
        node_mapper = {node: i for i, node in enumerate(graph.nodes())}
        edges = [
            [node_mapper[edge[0]], node_mapper[edge[1]]] for edge in graph.edges()
        ]
        line_graph = nx.from_edgelist(edges)
        return line_graph
    def fit(self, graphs: List[nx.classes.graph.Graph]):
        """
        Fitting a GL2Vec model.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
        """
        self._set_seed()
        graphs = self._check_graphs(graphs)
        graphs = [self._create_line_graph(graph) for graph in graphs]
        documents = [
            WeisfeilerLehmanHashing(
                graph, self.wl_iterations, False, self.erase_base_features
            )
            for graph in graphs
        ]
        documents = [
            TaggedDocument(words=doc.get_graph_features(), tags=[str(i)])
            for i, doc in enumerate(documents)
        ]

        self.model = Doc2Vec(
            documents,
            vector_size=self.dimensions,
            window=0,
            min_count=self.min_count,
            dm=0,
            sample=self.down_sampling,
            workers=self.workers,
            epochs=self.epochs,
            alpha=self.learning_rate,
            seed=self.seed,
        )

        self._embedding = [self.model.docvecs[str(i)] for i, _ in enumerate(documents)]
    def get_embedding(self) -> np.ndarray:
        r"""Getting the embedding of graphs.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of graphs.
        """
        return np.array(self._embedding)
    def infer(self, graphs) -> np.ndarray:
        """Infer the graph embeddings.

        Arg types:
            * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of graphs.
        """
        self._set_seed()
        graphs = self._check_graphs(graphs)
        graphs = [self._create_line_graph(graph) for graph in graphs]
        documents = [
            WeisfeilerLehmanHashing(
                graph, self.wl_iterations, False, self.erase_base_features
            )
            for graph in graphs
        ]
        documents = [doc.get_graph_features() for doc in documents]
        embedding = np.array(
            [
                self.model.infer_vector(
                    doc, alpha=self.learning_rate, min_alpha=0.00001, epochs=self.epochs
                )
                for doc in documents
            ]
        )
        return embedding
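

# The following demo is a hedged sketch, not part of the original karateclub
# module. It illustrates the transformation that ``_create_line_graph``
# performs: ``nx.line_graph`` maps every edge of the source graph to a node
# and connects two such nodes whenever the original edges share an endpoint;
# the integer relabeling mirrors the ``node_mapper`` step above. The triangle
# input is an illustrative choice.


def _demo_line_graph():
    # A triangle has three edges, so its line graph has three nodes and is
    # itself a triangle.
    triangle = nx.cycle_graph(3)
    line = nx.line_graph(triangle)
    mapper = {node: i for i, node in enumerate(line.nodes())}
    relabeled = nx.from_edgelist(
        [[mapper[u], mapper[v]] for u, v in line.edges()]
    )
    print(sorted(relabeled.edges()))  # [(0, 1), (0, 2), (1, 2)]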
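

# A minimal end-to-end usage sketch, assuming the karateclub package is
# installed; the random graphs are illustrative only, and GL2Vec expects
# graphs whose nodes are consecutive integers starting at zero (which
# ``gnp_random_graph`` provides). ``fit`` trains the Doc2Vec model on WL
# features of the line graphs, ``get_embedding`` returns one vector per
# training graph, and ``infer`` embeds unseen graphs with the fitted model.
if __name__ == "__main__":
    _demo_line_graph()

    # Ten small random training graphs.
    train_graphs = [nx.gnp_random_graph(20, 0.3, seed=i) for i in range(10)]

    model = GL2Vec(dimensions=32, epochs=5)
    model.fit(train_graphs)
    embedding = model.get_embedding()
    print(embedding.shape)  # (10, 32)

    # Embed three previously unseen graphs with the fitted model.
    new_graphs = [nx.gnp_random_graph(20, 0.3, seed=100 + i) for i in range(3)]
    print(model.infer(new_graphs).shape)  # (3, 32)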