# Source code for acdc_py.pp

### ---------- IMPORT DEPENDENCIES ----------
from ._pp import _corr_distance, _neighbors_knn, _neighbors_graph, _compute_diffusion_map, _nystrom_extension
import numpy as np

### ---------- EXPORT LIST ----------
# The export list is empty as written here, so a star-import of this module
# picks up no names unless this list is extended elsewhere.
# NOTE(review): confirm whether the public wrappers defined in this module
# are meant to be listed here.
__all__ = []

# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
# ------------------------------------------------------------------------------
# ---------------------------- ** DISTANCE FUNCS ** ----------------------------
# ------------------------------------------------------------------------------
# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
def corr_distance(
    adata,
    use_reduction=True,
    reduction_slot="X_pca",
    key_added="corr_dist",
    batch_size=1000,
    dtype=np.int16,
    verbose=True,
):
    """\
    Compute a distance matrix derived from Pearson correlation.

    Parameters
    ----------
    adata
        An anndata object containing a signature in adata.X.
    use_reduction : default: True
        If True, compute the distance from a reduction (highly recommended:
        accurate and much faster). If False, use the direct matrix instead.
    reduction_slot : default: "X_pca"
        Which reduction slot to use when use_reduction is True.
    key_added : default: "corr_dist"
        Slot in obsp in which the resulting distance matrix is stored.
    batch_size : default: 1000
        Process the data in batches of this size to lower total memory usage.
    dtype : default: np.int16
        Data type used to represent the distance values. np.int16 trades a
        smaller memory footprint against a loss of information that is not
        large enough to affect clustering. Available dtypes include np.int8,
        np.int16, np.int32, np.int64, np.float16, np.float32 and np.float64.
    verbose : default: True
        Show a progress bar for each batch of data.

    Returns
    -------
    Adds fields to the input adata, such that it contains a distance matrix
    stored in adata.obsp[key_added]. Whatever `_corr_distance` returns is
    handed back unchanged.
    """
    # NOTE(review): an earlier comment here implied that a value is returned
    # when `adata` is a NumPy array or a pandas DataFrame rather than an
    # anndata object -- confirm against `_corr_distance`.
    # Arguments are forwarded positionally, in the helper's expected order.
    forwarded = (
        adata,
        use_reduction,
        reduction_slot,
        key_added,
        batch_size,
        dtype,
        verbose,
    )
    return _corr_distance(*forwarded)
# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
# ------------------------------------------------------------------------------
# ---------------------------- ** KNN ARRAY FUNC ** ----------------------------
# ------------------------------------------------------------------------------
# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
def neighbors_knn(
    adata,
    max_knn=101,
    dist_slot="corr_dist",
    key_added="knn",
    batch_size=1000,
    verbose=True,
    njobs=1,
):
    """\
    Compute a KNN array from which connectivity graphs for clustering can
    then be generated rapidly with acdc.pp.neighbors_graph.

    Parameters
    ----------
    adata
        An anndata object containing a distance object in adata.obsp.
    max_knn : default: 101
        The maximum number of k-nearest neighbors (knn) to include in this
        array. acdc.pp.neighbors_graph will only be able to compute KNN
        graphs with knn <= max_knn.
    dist_slot : default: "corr_dist"
        The slot in adata.obsp where the distance object is stored. One way
        of generating this object is with acdc.pp.corr_distance.
    key_added : default: "knn"
        Slot in uns in which the resulting knn array is stored.
    batch_size : default: 1000
        Size of the batches used to reduce memory usage.
    verbose : default: True
        Whether to display a progress bar of the batches completed.
    njobs : default: 1
        Parallelization option that allows users to speed up runtime.

    Returns
    -------
    Adds fields to the input adata, such that it contains a knn array stored
    in adata.uns[key_added]. Whatever `_neighbors_knn` returns is handed back
    unchanged.
    """
    # NOTE(review): an earlier comment here implied that a value is returned
    # when `adata` is a NumPy array or a pandas DataFrame rather than an
    # anndata object -- confirm against `_neighbors_knn`.
    # Arguments are forwarded positionally, in the helper's expected order.
    forwarded = (
        adata,
        max_knn,
        dist_slot,
        key_added,
        batch_size,
        verbose,
        njobs,
    )
    return _neighbors_knn(*forwarded)
# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
# ------------------------------------------------------------------------------
# -------------------------- ** NEIGHBOR GRAPH FUNC ** -------------------------
# ------------------------------------------------------------------------------
# @-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-@-
def neighbors_graph(
    adata,
    n_neighbors=15,
    knn_slot='knn',
    batch_size=1000,
    verbose=True,
):
    """\
    Rapidly compute a k-nearest neighbor (knn) graph (i.e. connectivities)
    that can then be used for clustering.

    Parameters
    ----------
    adata
        An anndata object containing a knn array in adata.uns[knn_slot].
    n_neighbors : default: 15
        The number of nearest neighbors to use to build the connectivity
        graph. This number must be less than the total number of knn in the
        knn array stored in adata.uns[knn_slot].
    knn_slot : default: 'knn'
        The slot in adata.uns where the knn array is stored. One way of
        generating this object is with acdc.pp.neighbors_knn.
    batch_size : default: 1000
        Size of the batches used to reduce memory usage.
    verbose : default: True
        Whether to display a progress bar of the batches completed.

    Returns
    -------
    Adds fields to the input adata, such that it contains a knn graph stored
    in adata.obsp['connectivities'] along with metadata in
    adata.uns["neighbors"]. Whatever `_neighbors_graph` returns is handed
    back unchanged.
    """
    # NOTE(review): an earlier comment here implied that a value is returned
    # when `adata` is a NumPy array or a pandas DataFrame rather than an
    # anndata object -- confirm against `_neighbors_graph`.
    # Arguments are forwarded positionally, in the helper's expected order.
    forwarded = (adata, n_neighbors, knn_slot, batch_size, verbose)
    return _neighbors_graph(*forwarded)
def compute_diffusion_map(reference_data, neigen=2, epsilon=None, pca_comps=None):
    """\
    Build a diffusion map embedding from reference (training) data.

    Parameters
    ----------
    reference_data : array-like, shape (n_samples_ref, n_features)
        Input training data (dense or sparse; will be densified internally).
    neigen : int, optional
        Number of non-trivial diffusion components to return. Default is 2.
    epsilon : float or None, optional
        Gaussian kernel width parameter. If None, it is set to the square of
        the median of the non-zero pairwise distances. Default is None.
    pca_comps : int or None, optional
        If provided, PCA is applied to reduce the data to this many
        dimensions before distances are computed. Default is None.

    Returns
    -------
    result : dict
        Dictionary containing:
        - 'ref_diffusion'       : Diffusion coordinates (n_samples_ref × neigen).
        - 'eigenvalues'         : Selected eigenvalues (length neigen).
        - 'distance_matrix_ref' : Pairwise distance matrix (dense).
        - 'ref_proc'            : Processed reference data after optional PCA.
        - 'epsilon'             : Kernel width used.
        - 'pca'                 : PCA object if used, else None.
    """
    # The tuning options are passed to the helper by keyword.
    options = {
        "neigen": neigen,
        "epsilon": epsilon,
        "pca_comps": pca_comps,
    }
    return _compute_diffusion_map(reference_data, **options)
def nystrom_extension(query_data, diffusion_obj):
    """\
    Project new query data onto a reference diffusion map with the Nyström
    method.

    Parameters
    ----------
    query_data : array-like, shape (n_query, n_features)
        Feature matrix for query samples, preprocessed to match the
        reference.
    diffusion_obj : dict
        Output of compute_diffusion_map, containing:
        - 'ref_proc'      : ndarray of shape (n_ref, n_features_proc)
        - 'epsilon'       : float
        - 'ref_diffusion' : ndarray of shape (n_ref, neigen)
        - 'eigenvalues'   : ndarray of length neigen
        - 'pca'           : PCA object or None

    Returns
    -------
    result : dict
        Dictionary containing:
        - 'query_diffusion'       : Mapped diffusion coordinates (n_query × neigen).
        - 'distance_matrix_query' : Pairwise distance matrix (dense).
    """
    extension = _nystrom_extension(query_data, diffusion_obj)
    return extension