Source code for cell2location.cluster_averages.markers_by_hierarhy

import numpy as np
import pandas as pd


[docs]def markers_by_hierarhy(inf_aver, var_names, hierarhy_df, quantile=[0.05, 0.1, 0.2], mode="exclusive"): r"""Find which genes are expressed at which level of cell type hierarchy. Assigns expression counts for each gene to higher levels of hierarhy using estimates of average expression for the lowest level and substracts that expression from the lowest level. For example, low level annotation can be `Inh_SST neurones`, high level `Inh neurones`, very high level `neurones`, top level `all cell types`. The function can deal with any number of layers but the order needs to be carefully considered (from broad to specific). .. math:: g_{g} = \min\limits_{f} g_{f,g} .. math:: g_{fn,g} = (\min\limits_{f∈fn} g_{f,g}) - g_{g} .. math:: ... .. math:: g_{f3,g} = (\min\limits_{f∈f3} g_{f,g}) - ... - g_{fn,g} - g_{g} .. math:: g_{f2,g} = (\min\limits_{f∈f2} g_{f,g}) - g_{f3,g} - ... - g_{fn,g} - g_{g} .. math:: g_{f1, g} = g_{f,g} - g_{f2,g} - g_{f3,g} - ... - g_{fn,g} - g_{g} Here, :math:`g_{f,g}` represents average expression of each gene in each level 1 cluster. :math:`g_{f1,g}` represents average expression of each gene unique to each level 1 cluster. :math:`g_{f2,g}` represents average expression of each gene unique to each level 2 cluster. :math:`g_{f3,g}` represents average expression of each gene unique to each level 3 cluster. :math:`g_{fn,g}` represents average expression of each gene unique to each level n cluster (can be deep). :math:`g_{g}` represents average expression of each gene unique to the top level (all cells). :param inf_aver: np.ndarray with :math:`g_{g,f}` or with :math:`g_{g,f,s}` where `s` represents posterior samples :param var_names: list, array or index with variable names :param hierarhy_df: pd.DataFrame that provides mapping between clusters at different levels. Index corresponds to level 1 :math:`f1`, first columns to the top level, second columns to the n-th level :math:`fn`, last column corresponds to the second level :math:`f2`. It is crucial the order of cell types :math:`f` in `hierarhy_df` matches the order of cell types in axis 1 of `inf_aver`. :param quantile: list of posterior distribution quantiles to be computed :param mode: 'exclusive' or 'tree' mode. In 'exclusive' mode, the number of counts specific to each layer is computed (e.g. counts at layer 2 are excluded from layer 1). In 'tree' mode, children nodes inherit the expression of their parents (e.g. layer 1 countains the original counts :math:`g_{f,g}`, layer 2 contains counts from all parent layers :math:`g_{f2,g} + g_{f3,g} + ... + g_{fn,g} + g_{g}`. :return: When input is :math:`g_{g,f}` the output is pd.DataFrame with values for :math:`f1, f2, f3, ..., fn, all`in columns. When input is :math:`g_{g,f,s}` where `s` represents posterior sample the output is a dictionary with posterior samples for :math:`g_{g,f1-fn+all,s}` and similar dataframes for 'mean' and quantiles of the posterior distribution (e.g. 'q0.05'). """ results = {} names = {} if len(inf_aver.shape) == 2: # using summarised posterior samples results["level_1"] = pd.DataFrame(inf_aver, index=var_names, columns=list(hierarhy_df.index)) for k in np.arange(hierarhy_df.shape[1]) + 2: k_names = list(hierarhy_df.iloc[:, k - 2].unique()) k_level = hierarhy_df.shape[1] + 3 - k results[f"level_{k_level}"] = pd.DataFrame(index=var_names, columns=k_names) # iterate over clusters at each level (e.g. f2, f3 ...) for c in k_names: ind = hierarhy_df.iloc[:, k - 2] == c c_names = hierarhy_df.index[ind] ind_min = results["level_1"][c_names].min(1) results[f"level_{k_level}"][c] = ind_min results["level_1"][c_names] = (results["level_1"][c_names].T - ind_min).T if mode == "tree": # when mode is tree, add counts from parent levels for plev in np.arange(len(results) - 1): p_level = len(results) - plev p_names = list(hierarhy_df.iloc[:, plev].unique()) # iterate over clusters at each level (e.g. f2, f3 ...) for p in p_names: ind = hierarhy_df.iloc[:, plev] == p if (plev) == (len(results) - 2): ch_names = hierarhy_df.index[ind] else: ch_names = hierarhy_df.loc[ind, :].iloc[:, plev + 1] results[f"level_{p_level-1}"][ch_names] = ( results[f"level_{p_level-1}"][ch_names].T + results[f"level_{p_level}"][p].values ).T # concatenate to produce a general summary sep_inf_aver = pd.concat(list(results.values()), axis=1) return sep_inf_aver elif len(inf_aver.shape) == 3: # using all posterior samples n_genes = inf_aver.shape[0] n_samples = inf_aver.shape[2] results["level_1"] = inf_aver.copy() names["level_1"] = list(hierarhy_df.index) for k in np.arange(hierarhy_df.shape[1]) + 2: k_names = list(hierarhy_df.iloc[:, k - 2].unique()) k_level = hierarhy_df.shape[1] + 3 - k results[f"level_{k_level}"] = np.zeros((n_genes, len(k_names), n_samples)) names[f"level_{k_level}"] = k_names # iterate over clusters at each level (e.g. f2, f3 ...) for c in k_names: ind = hierarhy_df.iloc[:, k - 2] == c k_ind = np.isin(k_names, c) ind_min = results["level_1"][:, ind, :].min(axis=1).reshape((n_genes, 1, n_samples)) results[f"level_{k_level}"][:, k_ind, :] = ind_min results["level_1"][:, ind, :] = results["level_1"][:, ind, :] - ind_min if mode == "tree": # when mode is tree, add counts from parent levels for plev in np.arange(len(results) - 1): p_level = len(results) - plev p_names = list(hierarhy_df.iloc[:, plev].unique()) # iterate over clusters at each level (e.g. f2, f3 ...) for p in p_names: ind = hierarhy_df.iloc[:, plev] == p if (plev) == (len(results) - 2): ch_names = hierarhy_df.index[ind] else: ch_names = hierarhy_df.loc[ind, :].iloc[:, plev + 1] ind = np.isin(names[f"level_{p_level-1}"], ch_names) p_ind = np.isin(p_names, p) results[f"level_{p_level-1}"][:, ind, :] = results[f"level_{p_level-1}"][:, ind, :] + results[ f"level_{p_level}" ][:, p_ind, :].reshape((n_genes, 1, n_samples)) sep_inf_aver = np.concatenate(list(results.values()), axis=1) from itertools import chain sep_inf_aver_names = list(chain(*names.values())) out = { "samples": sep_inf_aver, "mean": pd.DataFrame( np.squeeze(np.mean(sep_inf_aver, axis=2)), index=var_names, columns=sep_inf_aver_names ), } for q in quantile: out[f"q{q}"] = pd.DataFrame( np.squeeze(np.quantile(sep_inf_aver, q=q, axis=2)), index=var_names, columns=sep_inf_aver_names ) # TODO remove redundant layers return out