import networkx as nx
import numpy as np
from collections import Counter
from karateclub.estimator import Estimator
[docs]class NodeSketch(Estimator):
r"""An implementation of `"NodeSketch" <https://exascale.info/assets/pdf/yang2019nodesketch.pdf>`_
from the KDD '19 paper "NodeSketch: Highly-Efficient Graph Embeddings
via Recursive Sketching". The procedure starts by sketching the self-loop-augmented
adjacency matrix of the graph to output low-order node embeddings, and then recursively
generates k-order node embeddings based on the self-loop-augmented adjacency matrix
and (k-1)-order node embeddings.
Args:
dimensions (int): Embedding dimensions. Default is 32.
iterations (int): Number of iterations (sketch order minus one). Default is 2.
decay (float): Exponential decay rate. Default is 0.01.
seed (int): Random seed value. Default is 42.
"""
def __init__(
self,
dimensions: int = 32,
iterations: int = 2,
decay: float = 0.01,
seed: int = 42,
):
self.dimensions = dimensions
self.iterations = iterations
self.decay = decay
self.seed = seed
self._weight = self.decay / self.dimensions
def _generate_hash_values(self):
"""
Predefine a hash matrix
"""
random_matrix = np.random.rand(self.dimensions, self._num_nodes)
hashes = -np.log(random_matrix)
return hashes
def _do_single_sketch(self):
"""
Perform a single round of sketching
"""
sketch = []
for iter in range(self.dimensions):
hashed = self._sla.copy()
hashed.data = np.array(
[
self._hash_values[iter, self._sla.col[edge]] / self._sla.data[edge]
for edge in range(len(self._sla.data))
]
)
min_values = [np.inf for k in range(self._num_nodes)]
min_indices = [None for k in range(self._num_nodes)]
for i, j, v in zip(hashed.row, hashed.col, hashed.data):
if v < min_values[i]:
min_values[i] = v
min_indices[i] = j
sketch.append(min_indices)
self._sketch = sketch
def _augment_sla(self):
"""
Augment the sla matrix based on the previous sketch
"""
self._sla = self._sla_original.copy()
data = []
row = []
col = []
for node in range(self._num_nodes):
frequencies = []
for neighbor in list(self._graph[node]):
frequencies.append(Counter([dim[neighbor] for dim in self._sketch]))
frequencies = sum(frequencies, Counter())
for target, value in frequencies.items():
row.append(node)
col.append(target)
data.append(value * self._weight)
self._sla.data = np.append(self._sla.data, data)
self._sla.row = np.append(self._sla.row, row)
self._sla.col = np.append(self._sla.col, col)
self._sla.sum_duplicates()
def _sketch_to_np_array(self):
"""
Transform sketch to numpy array
"""
return np.array(self._sketch)
[docs] def fit(self, graph):
"""
Fitting a NodeSketch model.
Arg types:
* **graph** *(NetworkX graph)* - The graph to be embedded.
"""
self._set_seed()
graph = self._check_graph(graph)
self._graph = graph
self._num_nodes = len(graph.nodes)
self._hash_values = self._generate_hash_values()
self._sla = nx.adjacency_matrix(
self._graph, nodelist=range(self._num_nodes)
).tocoo()
self._sla.data = np.array([1 for _ in range(len(self._sla.data))])
self._sla_original = self._sla.copy()
self._do_single_sketch()
for _ in range(self.iterations - 1):
self._augment_sla()
self._do_single_sketch()
[docs] def get_embedding(self):
r"""Getting the node embedding.
Return types:
* **embedding** *(Numpy array)* - The embedding of nodes.
"""
embedding = np.transpose(self._sketch_to_np_array())
return embedding