Source code for karateclub.node_embedding.neighbourhood.boostne

import numpy as np
import networkx as nx
from scipy import sparse
from sklearn.decomposition import NMF
from karateclub.estimator import Estimator
from inspect import signature


[docs]class BoostNE(Estimator):
    r"""An implementation of `"BoostNE" <https://arxiv.org/abs/1808.08627>`_
    from the ASONAM '19 paper "Multi-Level Network Embedding with Boosted Low-Rank
    Matrix Approximation". The procedure uses non-negative matrix factorization
    iteratively to decompose the residuals obtained by previous factorization models.
    The base target matrix is a pooled sum of adjacency matrix powers.

    Args:
        dimensions (int): Number of individual embedding dimensions. Default is 8.
        iterations (int): Number of boosting iterations. Default is 16.
        order (int): Number of adjacency matrix powers. Default is 2.
        alpha (float): NMF regularization parameter. Default is 0.01.
        seed (int): Random seed value. Default is 42.
    """

    def __init__(
        self,
        dimensions: int = 8,
        iterations: int = 16,
        order: int = 2,
        alpha: float = 0.01,
        seed: int = 42,
    ):
        self.dimensions = dimensions
        self.iterations = iterations
        self.order = order
        self.alpha = alpha
        self.seed = seed

    def _create_D_inverse(self, graph):
        """
        Creating a sparse inverse degree matrix.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.

        Return types:
            * **D_inverse** *(Scipy array)* - Diagonal inverse degree matrix.
        """
        index = np.arange(graph.number_of_nodes())
        values = np.array(
            [1.0 / graph.degree[node] for node in range(graph.number_of_nodes())]
        )
        shape = (graph.number_of_nodes(), graph.number_of_nodes())
        D_inverse = sparse.coo_matrix((values, (index, index)), shape=shape)
        return D_inverse

    def _create_base_matrix(self, graph):
        """
        Creating a tuple with the normalized adjacency matrix.

        Return types:
            * **(A_hat, A_hat, A_hat)** *(Tuple of SciPy arrays)* - Normalized adjacency matrices.
        """
        A = nx.adjacency_matrix(graph, nodelist=range(graph.number_of_nodes()))
        D_inverse = self._create_D_inverse(graph)
        A_hat = D_inverse.dot(A)
        return (A_hat, A_hat, A_hat)

    def _create_target_matrix(self, graph):
        """
        Creating a log transformed target matrix.

        Return types:
            * **target_matrix** *(SciPy array)* - The PMI matrix.
        """
        A_tilde, A_hat, A_accum = self._create_base_matrix(graph)
        for _ in range(self.order - 1):
            A_tilde = sparse.coo_matrix(A_tilde.dot(A_hat))
            A_accum = A_accum + A_tilde
        A_accum = A_accum / self.order
        return A_accum

    def _sampler(self, index):
        """
        Anchor sampling procedure.

        Arg types:
            * **index** *(int)* - The axis for marginalization.

        Return types:
            * **sample** *(int)* - Anchor point index.
        """
        row_weights = self._residuals.sum(axis=index)
        if len(row_weights.shape) > 1:
            row_weights = row_weights.reshape(-1)
        sums = np.sum(np.sum(row_weights))
        to_pick_from = row_weights.reshape(-1)
        to_pick_from = (to_pick_from / np.sum(to_pick_from)).tolist()[0]
        sample = self._binary_search(to_pick_from)
        return sample

    def _reweighting(self, X, chosen_row, chosen_col):
        """
        Re-scaling the target matrix with the anchor row and column.

        Arg types:
            * **X** *(COO Scipy matrix)* - The target matrix.
            * **chosen_row** *(int)* - The row anchor.
            * **choswen_col** *(int)* - The column anchor.

        Return types:
            * **X** *(COO Scipy matrix)* - The rescaled target matrix.
        """
        row_sims = X.dot(chosen_row.transpose())
        column_sims = chosen_col.transpose().dot(X)
        X = sparse.csr_matrix(row_sims).multiply(X)
        X = X.multiply(sparse.csr_matrix(column_sims))
        return X

    def _fit_and_score_NMF(self, new_residuals):
        """
        Factorizing a residual matrix, returning the approximate target, and an embedding.

        Arg types:
            * **new_residuals** *(COO Scipy matrix)* - The residual matrix.

        Return types:
            * **scores** *(COO Scipy matrix)* - The residual scores.
            * **W** *(Numpy array)* - The embedding matrix.
        """
        
        parameter_names = signature(NMF).parameters

        if "alpha" in parameter_names:
            model = NMF(
                n_components=self.dimensions,
                init="random",
                alpha=self.alpha,
                verbose=False,
            )
        elif "alpha_W" in parameter_names:
            model = NMF(
                n_components=self.dimensions,
                init="random",
                alpha_W=self.alpha,
                verbose=False,
            )
        else:
            raise NotImplementedError(
                "The version of Scikit-learn installed "
                "on this device is not currently supported. "
                "More specifically, in older version of the NMF "
                "method a parameter called `alpha` was available "
                "and it has been replaced with a second parameter "
                "called `alpha_W`. In the installed version neither "
                "parameters were found, and it is therefore unclear "
                "as to how we should proceed."
            )

        W = model.fit_transform(new_residuals)
        H = model.components_

        sub_scores = np.sum(
            np.multiply(W[self._index_1, :], H[:, self._index_2].T), axis=1
        )
        scores = np.maximum(self._residuals.data - sub_scores, 0)
        scores = sparse.csr_matrix(
            (scores, (self._index_1, self._index_2)),
            shape=self._shape,
            dtype=np.float32,
        )
        return scores, W

    def _setup_base_model(self):
        """
        Fitting NMF on the starting matrix.
        """
        self._shape = self._residuals.shape
        indices = self._residuals.nonzero()
        self._index_1 = indices[0]
        self._index_2 = indices[1]
        base_score, embedding = self._fit_and_score_NMF(self._residuals)
        self._embeddings = [embedding]

    def _binary_search(self, weights):
        """
        Weighted search procedure. Choosing a random index.

        Arg types:
            * **weights** *(Numpy array)* - The weights for choosing an index.

        Return types:
            * **low/mid** *(int)* - Sampled index.
        """
        running_totals = np.cumsum(weights)
        target_distance = np.random.uniform(0, 1)
        low, high = 0, len(weights)
        while low < high:
            mid = int((low + high) / 2)
            distance = running_totals[mid]
            if distance < target_distance:
                low = mid + 1
            elif distance > target_distance:
                high = mid
            else:
                return mid
        return low

    def _single_boosting_round(self):
        """
        A method to perform anchor sampling, rescaling, factorization and scoring.
        """
        row = self._sampler(1)
        column = self._sampler(0)
        chosen_row = self._residuals[row, :]
        chosen_column = self._residuals[:, column]
        new_residuals = self._reweighting(self._residuals, chosen_row, chosen_column)
        scores, embedding = self._fit_and_score_NMF(new_residuals)
        self._embeddings.append(embedding)
        self._residuals = scores

[docs]    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a BoostNE model.

        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be embedded.
        """
        self._set_seed()
        graph = self._check_graph(graph)
        self._residuals = self._create_target_matrix(graph)
        self._setup_base_model()
        for _ in range(self.iterations):
            self._single_boosting_round()

[docs]    def get_embedding(self) -> np.array:
        r"""Getting the node embedding.

        Return types:
            * **embedding** *(Numpy array)* - The embedding of nodes.
        """
        embedding = np.concatenate(self._embeddings, axis=1)
        return embedding