Source code for cell2location.models.base.base_model

# -*- coding: utf-8 -*-
"""Base model class"""

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment

from cell2location.plt.plot_heatmap import clustermap


# base model class - defining shared methods but not the model itself
class BaseModel:
    r"""Base class for pymc3 and pyro models.

    :param X_data: Numpy array of gene expression (cols) in spatial locations (rows)
    :param n_fact: Number of factors
    :param n_iter: Number of training iterations
    :param learning_rate: ADAM learning rate for optimising the variational inference objective
    :param data_type: theano data type used to store parameters ('float32' for single, 'float64' for double precision)
    :param total_grad_norm_constraint: gradient constraint used in optimisation
    :param verbose: print diagnostic messages?
    :param var_names: Variable names (e.g. gene identifiers)
    :param var_names_read: Readable variable names (e.g. gene symbols)
    :param obs_names: Observation names (e.g. cell or spot ids)
    :param fact_names: Factor names
    :param sample_id: Sample identifiers (e.g. different experiments)
    """

    def __init__(
        self,
        X_data: np.ndarray,
        n_fact: int = 10,
        data_type: str = "float32",
        n_iter: int = 200000,
        learning_rate=0.001,
        total_grad_norm_constraint=200,
        verbose=True,
        var_names=None,
        var_names_read=None,
        obs_names=None,
        fact_names=None,
        sample_id=None,
    ):
        # Initialise parameters
        self.X_data = X_data
        self.n_fact = n_fact
        self.n_var = X_data.shape[1]
        self.n_obs = X_data.shape[0]
        self.data_type = data_type
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.total_grad_norm_constraint = total_grad_norm_constraint
        self.verbose = verbose
        self.fact_filt = None
        self.gene_loadings = None
        self.minibatch_size = None
        self.minibatch_seed = None
        self.extra_data = None  # input data
        self.extra_data_tt = None  # minibatch parameters

        # add essential annotations
        if var_names is None:
            self.var_names = pd.Series(
                ["g_" + str(i) for i in range(self.n_var)], index=["g_" + str(i) for i in range(self.n_var)]
            )
        else:
            self.var_names = pd.Series(var_names, index=var_names)

        if var_names_read is None:
            self.var_names_read = pd.Series(self.var_names, index=self.var_names)
        else:
            self.var_names_read = pd.Series(var_names_read, index=self.var_names)

        if obs_names is None:
            self.obs_names = pd.Series(
                ["c_" + str(i) for i in range(self.n_obs)], index=["c_" + str(i) for i in range(self.n_obs)]
            )
        else:
            self.obs_names = pd.Series(obs_names, index=obs_names)

        if fact_names is None:
            self.fact_names = pd.Series(["fact_" + str(i) for i in range(self.n_fact)])
        else:
            self.fact_names = pd.Series(fact_names)

        if sample_id is None:
            self.sample_id = pd.Series(["sample" for i in range(self.n_obs)], index=self.obs_names)
        else:
            self.sample_id = pd.Series(sample_id, index=self.obs_names)
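    # A minimal construction sketch (hypothetical values; BaseModel defines shared
    # utilities only, so in practice a pymc3/pyro subclass is instantiated instead):
    #
    #   x = np.random.poisson(1.0, size=(50, 100))  # 50 locations x 100 genes
    #   mod = BaseModel(X_data=x, n_fact=5)
    #   mod.var_names[:3]  # auto-generated gene ids: g_0, g_1, g_2
    #   mod.obs_names[:3]  # auto-generated observation ids: c_0, c_1, c_2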
    def plot_prior_vs_data(self, data_target_name="data_target", data_node="X_data", log_transform=True):
        r"""Plot data vs a single sample from the prior in a 2D histogram.

        Uses self.X_data and self.prior_trace['data_target'].

        :param data_target_name: name of the self.prior_trace slot containing the prior sample
        :param data_node: name of the object slot containing data
        :param log_transform: log10(x + 1)-transform both arrays before plotting?
        """
        if type(data_node) is str:
            data_node = getattr(self, data_node)

        if type(data_target_name) is str:
            data_target_name = self.prior_trace[data_target_name]

        # If there are multiple prior samples, expand the data array to match
        if len(data_target_name.shape) > 2:
            data_node = np.array([data_node for _ in range(data_target_name.shape[0])])

        if log_transform:
            data_node = np.log10(data_node + 1)
            data_target_name = np.log10(data_target_name + 1)

        plt.hist2d(data_node.flatten(), data_target_name.flatten(), bins=50, norm=matplotlib.colors.LogNorm())
        plt.xlabel("Data, log10(nUMI)")
        plt.ylabel("Prior sample, log10(nUMI)")
        plt.title("UMI counts (all spots, all genes)")
        plt.tight_layout()
    @staticmethod
    def align_plot_stability(fac1, fac2, name1, name2, align=True, return_aligned=False):
        r"""Align columns between two np.ndarrays using scipy.optimize.linear_sum_assignment,
        then plot correlations between columns in fac1 and fac2, ordering fac2 according to the alignment.

        :param fac1: np.ndarray 1, factors in columns
        :param fac2: np.ndarray 2, factors in columns
        :param name1: x-axis name
        :param name2: y-axis name
        :param align: boolean, match columns in fac1 and fac2 using linear_sum_assignment?
        :param return_aligned: boolean, return the column order in fac2 that best matches fac1?
        """
        corr12 = np.corrcoef(fac1, fac2, rowvar=False)
        # keep only the fac1-vs-fac2 block of the joint correlation matrix
        ind_top = np.arange(0, fac1.shape[1])
        ind_right = np.arange(0, fac1.shape[1]) + fac1.shape[1]
        corr12 = corr12[ind_top, :][:, ind_right]
        corr12[np.isnan(corr12)] = -1

        if align:
            img = corr12[:, linear_sum_assignment(2 - corr12)[1]]
        else:
            img = corr12

        plt.imshow(img)

        plt.title(f"Training initialisation \n {name1} vs {name2}")
        plt.xlabel(name2)
        plt.ylabel(name1)
        plt.tight_layout()

        if return_aligned:
            return linear_sum_assignment(2 - corr12)[1]
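    # Usage sketch for the static alignment helper (hypothetical random factors;
    # with align=True, columns of fac2 are reordered by linear_sum_assignment to
    # maximise the total correlation with the columns of fac1):
    #
    #   fac1 = np.random.gamma(1.0, size=(50, 5))
    #   fac2 = fac1[:, ::-1] + np.random.normal(0, 0.1, size=(50, 5))
    #   order = BaseModel.align_plot_stability(fac1, fac2, "restart 1", "restart 2",
    #                                          align=True, return_aligned=True)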
    def generate_cv_data(self, n: int = 2, discrete: bool = True, non_discrete_mean_var: float = 1):
        r"""Generate X_data for molecular cross-validation by sampling molecule counts
        with np.random.binomial.

        :param n: number of cross-validation folds of equal size to generate; for now, only n=2 is implemented
        :param discrete: sample integer counts (binomial)? If False, split using a Gamma approximation.
        :param non_discrete_mean_var: variance-to-mean ratio of the Gamma approximation; low means lower variance
        """
        if n != 2:
            raise ValueError("only n=2 is implemented for molecular cross-validation")

        self.X_data_sample = {}
        if discrete:
            self.X_data_sample[0] = np.random.binomial(self.X_data.astype("int64"), 1 / n).astype(float)
        else:
            # Gamma with mean X_data / n and variance (X_data / n) * non_discrete_mean_var:
            # shape = mean^2 / variance, scale = variance / mean
            shape = (self.X_data / n) ** 2 / ((self.X_data / n) * non_discrete_mean_var)
            scale = ((self.X_data / n) * non_discrete_mean_var) / (self.X_data / n)
            self.X_data_sample[0] = np.random.gamma(shape=shape, scale=scale, size=scale.shape).astype(float)

        self.X_data_sample[1] = np.abs(self.X_data - self.X_data_sample[0])
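    # Usage sketch (assumes `mod` is an instance holding count data in mod.X_data):
    # binomial thinning with p = 1/2 assigns each molecule to fold 0 with probability
    # 1/2 and the remainder to fold 1, so that, under a Poisson model of the counts,
    # the two folds are independent noisy views of the same expression levels:
    #
    #   mod.generate_cv_data(n=2)
    #   train, validation = mod.X_data_sample[0], mod.X_data_sample[1]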
    def bootstrap_data(self, n=10, downsampling_p=0.8, discrete=True, non_discrete_mean_var=1):
        r"""Generate X_data for bootstrap analysis by sampling molecule counts
        with np.random.binomial.

        :param n: number of bootstrap samples to generate
        :param downsampling_p: sample this proportion of values
        :param discrete: sample integer counts (binomial)? If False, downsample using a Gamma approximation.
        :param non_discrete_mean_var: variance-to-mean ratio of the Gamma approximation; low means lower variance
        """
        self.X_data_sample = {}

        for i in range(n):
            if discrete:
                self.X_data_sample[i] = np.random.binomial(
                    self.X_data.astype("int64"), downsampling_p
                ).astype(float)
            else:
                # Gamma with mean X_data * downsampling_p and variance
                # (X_data * downsampling_p) * non_discrete_mean_var
                shape = (self.X_data * downsampling_p) ** 2 / (self.X_data * downsampling_p * non_discrete_mean_var)
                scale = (self.X_data * downsampling_p * non_discrete_mean_var) / (self.X_data * downsampling_p)
                self.X_data_sample[i] = np.random.gamma(shape=shape, scale=scale, size=scale.shape).astype(float)
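    # Usage sketch (assumes `mod` holds count data): each bootstrap sample keeps
    # roughly `downsampling_p` of the molecules, and downstream stability can be
    # assessed by re-training the model on each mod.X_data_sample[i]:
    #
    #   mod.bootstrap_data(n=10, downsampling_p=0.8)
    #   first_sample = mod.X_data_sample[0]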
    def plot_posterior_mu_vs_data(self, mu_node_name="mu", data_node="X_data"):
        r"""Plot the expected value of the model (e.g. mean of the Poisson distribution) against the data.

        :param mu_node_name: name of the object slot containing the expected value
        :param data_node: name of the object slot containing data
        """
        if type(mu_node_name) is str:
            mu = getattr(self, mu_node_name)
        else:
            mu = mu_node_name

        if type(data_node) is str:
            data_node = getattr(self, data_node)

        plt.hist2d(
            np.log10(data_node.flatten() + 1), np.log10(mu.flatten() + 1), bins=50, norm=matplotlib.colors.LogNorm()
        )
        plt.gca().set_aspect("equal", adjustable="box")
        plt.xlabel("Data, log10(nUMI)")
        plt.ylabel("Posterior sample, log10(nUMI)")
        plt.title("UMI counts (all cells, all genes)")
        plt.tight_layout()
    def plot_history(self, iter_start=0, iter_end=-1, mean_field_slot=None, log_y=True, ax=None):
        r"""Plot training history.

        :param iter_start: omit initial iterations from the plot
        :param iter_end: omit last iterations from the plot
        :param mean_field_slot: which training restarts to plot (default: all slots in self.hist)
        :param log_y: plot log10(loss)?
        :param ax: matplotlib axis to plot into; when None, plot via the pyplot interface
        """
        if ax is None:
            ax = plt
            # patch the pyplot module so the Axes-style calls below work on it
            ax.set_xlabel = plt.xlabel
            ax.set_ylabel = plt.ylabel

        if mean_field_slot is None:
            mean_field_slot = self.hist.keys()

        if isinstance(mean_field_slot, str):
            mean_field_slot = [mean_field_slot]

        for i in mean_field_slot:
            if iter_end == -1:
                iter_end = np.array(self.hist[i]).flatten().shape[0]

            y = np.array(self.hist[i]).flatten()[iter_start:iter_end]
            if log_y:
                y = np.log10(y)
            ax.plot(np.arange(iter_start, iter_end), y, label="train")
            ax.set_xlabel("Training epochs")
            ax.set_ylabel("Reconstruction accuracy (ELBO loss)")
            ax.legend()
            plt.tight_layout()
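    # Usage sketch (assumes a trained model with a .hist dict of loss traces,
    # which is populated by subclasses during training):
    #
    #   fig, axes = plt.subplots(1, 2)
    #   mod.plot_history(ax=axes[0])                   # full training history
    #   mod.plot_history(iter_start=1000, ax=axes[1])  # skip the initial burn-in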
    def plot_validation_history(self, start_step=0, end_step=-1, mean_field_slot="init_1", log_y=True, ax=None):
        r"""Plot model loss (NB likelihood + penalty) using the model on training and validation data."""
        if ax is None:
            ax = plt
            # patch the pyplot module so the Axes-style calls below work on it
            ax.set_xlabel = plt.xlabel
            ax.set_ylabel = plt.ylabel

        if end_step == -1:
            end_step = np.array(self.training_hist[mean_field_slot]).flatten().shape[0]

        y = np.array(self.training_hist[mean_field_slot]).flatten()[start_step:end_step]
        if log_y:
            y = np.log10(y)
        ax.plot(np.arange(start_step, end_step), y, label="train")

        y = np.array(self.validation_hist[mean_field_slot]).flatten()[start_step:end_step]
        if log_y:
            y = np.log10(y)
        ax.plot(np.arange(start_step, end_step), y, label="validation")

        ax.set_xlabel("Training epochs")
        ax.set_ylabel("Reconstruction accuracy (log10 NB + L2 loss) / ELBO loss")
        ax.legend()
        plt.tight_layout()
    def plot_posterior_vs_data(self, gene_fact_name="gene_factors", cell_fact_name="cell_factors"):
        r"""Plot the model reconstruction (product of cell and gene factors) against the data.

        :param gene_fact_name: name of gene factors in self.samples['post_sample_means']
        :param cell_fact_name: name of cell factors in self.samples['post_sample_means']
        """
        # extract posterior samples of scaled gene and cell factors (before the final likelihood step)
        gene_fact_s = self.samples["post_sample_means"][gene_fact_name]
        cell_factors_scaled_s = self.samples["post_sample_means"][cell_fact_name]

        # compute the poisson rate
        self.mu = np.dot(cell_factors_scaled_s, gene_fact_s.T)

        plt.hist2d(
            np.log10(self.X_data.flatten() + 1),
            np.log10(self.mu.flatten() + 1),
            bins=50,
            norm=matplotlib.colors.LogNorm(),
        )
        plt.gca().set_aspect("equal", adjustable="box")
        plt.xlabel("Data, log10(nUMI)")
        plt.ylabel("Posterior sample, log10(nUMI)")
        plt.title("UMI counts (all cells, all genes)")
        plt.tight_layout()
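    # The reconstruction above is the standard matrix-factorisation identity:
    # with cell factors W (n_obs x n_fact) and gene factors G (n_var x n_fact),
    # the expected counts are mu = W @ G.T (n_obs x n_var), which is what
    # np.dot(cell_factors_scaled_s, gene_fact_s.T) computes.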
    def set_fact_filt(self, fact_filt):
        r"""Specify which factors are not relevant / not expressed.
        It is currently used to filter results shown by .print_gene_loadings() and .plot_gene_loadings().

        :param fact_filt: logical array specifying which factors are to be retained
        """
        self.fact_filt = fact_filt
    def apply_fact_filt(self, df):
        r"""Select DataFrame columns by the factor filter saved in the model object.

        :param df: pd.DataFrame
        """
        if self.fact_filt is not None:
            df = df.iloc[:, self.fact_filt]

        return df
    def print_gene_loadings(
        self, gene_fact_name="gene_factors", loadings_attr="gene_loadings", top_n=10, gene_filt=None, fact_filt=None
    ):
        r"""Print the top genes for each factor in the gene loadings matrix.

        :param gene_fact_name: model parameter name to extract from samples if self.gene_loadings doesn't exist
        :param loadings_attr: model object attribute name that stores loadings
        :param top_n: number of genes to print for each factor (default 10)
        :param gene_filt: boolean filter for genes (e.g. restrict printed markers to TFs)
        :param fact_filt: boolean filter for factors
        """
        if getattr(self, loadings_attr) is None:
            gene_f = self.samples["post_sample_means"][gene_fact_name]
            setattr(
                self, loadings_attr, pd.DataFrame.from_records(gene_f, index=self.var_names, columns=self.fact_names)
            )

        gene_loadings = getattr(self, loadings_attr).copy()

        if fact_filt is not None:
            gene_loadings = gene_loadings.loc[:, fact_filt]

        if gene_filt is not None:
            gene_loadings = gene_loadings.loc[gene_filt, :]

        rows = []
        index = []
        columns = [f"top-{i + 1}" for i in range(top_n)]

        for clmn_ in gene_loadings.columns:
            loading_ = gene_loadings[clmn_].sort_values(ascending=False)
            index.append(clmn_)
            row = [f"{self.var_names_read[i]}: {loading_[i]:.2}" for i in loading_.head(top_n).index]
            rows.append(row)

        return pd.DataFrame(rows, index=index, columns=columns)
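    # Usage sketch (assumes a trained model whose posterior means include
    # 'gene_factors'); the result is a DataFrame with one row per factor:
    #
    #   top_genes = mod.print_gene_loadings(top_n=5)
    #   top_genes.loc["fact_0", :]  # top-5 markers of the first factor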
    def plot_gene_loadings(
        self,
        sel_var_names,
        var_names,
        gene_fact_name="gene_factors",
        loadings_attr="gene_loadings",
        figsize=(15, 7),
        cluster_factors=False,
        cluster_genes=True,
        cmap="viridis",
        title="",
        fact_filt=None,
        fun_type="heatmap",
        return_linkage=False,
    ):
        r"""Plot gene loadings as a heatmap.

        :param sel_var_names: list of variable names to select
        :param var_names: variable names identifying each gene in the gene loadings; `sel_var_names` is matched against these
        :param gene_fact_name: model parameter name to extract from samples if self.gene_loadings doesn't exist
        :param loadings_attr: model object attribute name that stores loadings
        :param figsize: heatmap figure size
        :param cluster_factors: hierarchically cluster factors?
        :param cluster_genes: hierarchically cluster genes?
        :param cmap: matplotlib colormap
        :param title: plot title
        :param fact_filt: boolean or character filter for factors
        """
        if getattr(self, loadings_attr) is None:
            gene_f = self.samples["post_sample_means"][gene_fact_name]
            setattr(
                self, loadings_attr, pd.DataFrame.from_records(gene_f, index=self.var_names, columns=self.fact_names)
            )

        gene_loadings = getattr(self, loadings_attr).copy()

        if fact_filt is not None:
            gene_loadings = gene_loadings.loc[:, fact_filt]

        # selected variable index
        sel_index = var_names.isin(sel_var_names)
        key_markers = gene_loadings.loc[sel_index, :]
        key_markers.index = self.var_names_read[sel_index]  # use readable names
        key_markers = key_markers.loc[sel_var_names, :]  # reorder

        clustermap(
            key_markers,
            cluster_rows=cluster_genes,
            cluster_cols=cluster_factors,
            return_linkage=return_linkage,
            figure_size=figsize,
            cmap=cmap,
            title=title,
            fun_type=fun_type,
        )
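    # Usage sketch (hypothetical marker list; `mod.var_names` identifies the genes
    # in the loadings, so it is passed as `var_names`):
    #
    #   markers = ["g_0", "g_5", "g_7"]
    #   mod.plot_gene_loadings(sel_var_names=markers, var_names=mod.var_names)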
    def plot_loading_distribution(self, loadings_name="gene_factors", loadings=None):
        r"""Plot a histogram for each loading (column-wise).

        :param loadings_name: character name to be extracted from `self.samples['post_sample_means']`
        :param loadings: np.ndarray to be plotted column-wise. Supersedes `loadings_name`.
        """
        if loadings is None:
            loadings = np.log10(self.samples["post_sample_means"][loadings_name])

        for i in range(loadings.shape[1]):
            plt.subplot(loadings.shape[1], 1, i + 1)
            plt.hist(loadings[:, i], bins=20)
            plt.xlabel(loadings_name + " value")
            plt.title("Factor: " + str(i))
        plt.tight_layout()
    def factor_expressed_plot(
        self,
        shape_cut=4,
        rate_cut=15,
        sample_type="post_sample_means",
        shape="cell_fact_shape_hyp",
        rate="cell_fact_rate_hyp",
        shape_lab="cell_factors, Gamma shape",
        rate_lab="cell_factors, Gamma rate",
        invert_selection=False,
    ):
        r"""Show which factors are expressed on a scatterplot of their regularising priors.

        :param shape_cut: Gamma shape cutoff below which factors are expressed
        :param rate_cut: Gamma rate cutoff below which factors are expressed
        :param sample_type: which posterior summary to look at, default 'post_sample_means'
        :param shape: parameter name for the Gamma shape of each factor, default 'cell_fact_shape_hyp'
        :param rate: parameter name for the Gamma rate of each factor, default 'cell_fact_rate_hyp'
        :param shape_lab: axis label for shape
        :param rate_lab: axis label for rate
        :param invert_selection: set to True if values below the cutoffs mark factors that are *not* expressed
        """
        # Expression shape and rate across cells
        shape = self.samples[sample_type][shape]
        rate = self.samples[sample_type][rate]
        plt.scatter(shape, rate)
        plt.xlabel(shape_lab)
        plt.ylabel(rate_lab)
        plt.vlines(shape_cut, 0, rate_cut)
        plt.hlines(rate_cut, 0, shape_cut)

        low_lab = high_lab = "expressed"
        if not invert_selection:
            high_lab = f"not {high_lab}"
        else:
            low_lab = f"not {low_lab}"

        plt.text(shape_cut - 0.5 * shape_cut, rate_cut - 0.5 * rate_cut, low_lab)
        plt.text(shape_cut + 0.1 * shape_cut, rate_cut + 0.1 * rate_cut, high_lab)

        return {"shape": shape, "rate": rate, "shape_cut": shape_cut, "rate_cut": rate_cut}
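    # Usage sketch (assumes the model exposes per-factor Gamma shape/rate
    # hyperparameters in its posterior summaries); by default, factors falling
    # below both cutoffs are labelled as expressed:
    #
    #   res = mod.factor_expressed_plot(shape_cut=4, rate_cut=15)
    #   expressed = (res["shape"] < res["shape_cut"]) & (res["rate"] < res["rate_cut"])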
    def plot_reconstruction_history(self, n_type="cv", start_step=0, end_step=45):
        r"""Plot reconstruction error using the model on training and cross-validation data."""
        # Extract RMSE from cross-validation parameter tracking
        rmse = np.array([[i["rmse_total"], i["rmse_total_cv"]] for i in self.tracking["init_1"]["rmse"]])
        # Squared differences between the two cross-validation folds: the error
        # of reconstructing one fold directly from the other
        rmse_cv = (self.X_data_sample[1] - self.X_data_sample[0]) * (self.X_data_sample[1] - self.X_data_sample[0])

        plt.plot(
            np.arange(start_step, end_step) * self.tracking_every,
            np.log10(rmse[start_step:end_step, 0]),
            label="model on training data",
        )
        plt.plot(
            np.arange(start_step, end_step) * self.tracking_every,
            np.log10(rmse[start_step:end_step, 1]),
            label="model on cross-validation data",
        )
        plt.hlines(
            np.log10(np.sqrt(rmse_cv.mean())),
            start_step * self.tracking_every,
            end_step * self.tracking_every,
            label="reconstructing using cross-validation data",
        )
        plt.xlabel("Training iterations")
        plt.ylabel("Reconstruction accuracy (log10 RMSE)")
        plt.legend()
    def export2adata(self, adata, slot_name="mod"):
        r"""Add posterior mean and sd for all parameters to unstructured data `adata.uns[slot_name]`.

        :param adata: anndata object
        :param slot_name: name of the `adata.uns` slot to write results to (default 'mod')
        """
        # add factor filter and samples of all parameters to unstructured data
        adata.uns[slot_name] = {}
        adata.uns[slot_name]["mod_name"] = str(self.__class__.__name__)
        adata.uns[slot_name]["fact_filt"] = self.fact_filt
        adata.uns[slot_name]["fact_names"] = self.fact_names.tolist()
        adata.uns[slot_name]["var_names"] = self.var_names.tolist()
        adata.uns[slot_name]["obs_names"] = self.obs_names.tolist()
        adata.uns[slot_name]["post_sample_means"] = self.samples["post_sample_means"]
        adata.uns[slot_name]["post_sample_sds"] = self.samples["post_sample_sds"]
        adata.uns[slot_name]["post_sample_q05"] = self.samples["post_sample_q05"]
        adata.uns[slot_name]["post_sample_q95"] = self.samples["post_sample_q95"]

        return adata
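# Export sketch (assumes a trained model and a matching AnnData object; the
# `anndata` import and construction below are illustrative, not part of this module):
#
#   import anndata
#   adata = anndata.AnnData(X=mod.X_data)
#   adata = mod.export2adata(adata, slot_name="mod")
#   adata.uns["mod"]["post_sample_means"].keys()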