feat(cellguide): improve computational marker gene algorithm (#6003)

chanzuckerberg · Oct 19, 2023 · 71d0c23 · 71d0c23
1 parent 845d776
commit 71d0c23
Show file tree

Hide file tree

Showing 11 changed files with 497 additions and 347 deletions.
diff --git a/backend/cellguide/pipeline/computational_marker_genes/__init__.py b/backend/cellguide/pipeline/computational_marker_genes/__init__.py
@@ -61,6 +61,29 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
         else:
             marker_genes[key] = marker_genes_per_tissue[key]
 
+    # convert all groupby_dims IDs to labels as required by CellGuide
+    organism_id_to_name = {k: v for d in snapshot.primary_filter_dimensions["organism_terms"] for k, v in d.items()}
+    tissue_id_to_name = {
+        k: v
+        for organism in snapshot.primary_filter_dimensions["tissue_terms"]
+        for i in snapshot.primary_filter_dimensions["tissue_terms"][organism]
+        for k, v in i.items()
+    }
+    for _, marker_gene_stats_list in marker_genes.items():
+        for marker_gene_stats in marker_gene_stats_list:
+            groupby_dims = marker_gene_stats.groupby_dims
+            groupby_terms = list(groupby_dims.keys())
+            groupby_term_labels = [term.rsplit("_", 1)[0] + "_label" for term in groupby_terms]
+            groupby_dims_new = dict(zip(groupby_term_labels, (groupby_dims[term] for term in groupby_terms)))
+
+            for key in groupby_dims_new:
+                if key == "tissue_ontology_term_label":
+                    groupby_dims_new[key] = tissue_id_to_name.get(groupby_dims_new[key], groupby_dims_new[key])
+                elif key == "organism_ontology_term_label":
+                    groupby_dims_new[key] = organism_id_to_name.get(groupby_dims_new[key], groupby_dims_new[key])
+
+            marker_gene_stats.groupby_dims = groupby_dims_new
+
     reformatted_marker_genes = {}
     for cell_type_id, marker_gene_stats_list in marker_genes.items():
         for marker_gene_stats in marker_gene_stats_list:
@@ -87,11 +110,11 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
             )
             reformatted_marker_genes[symbol][organism][tissue].append(data)
 
-    # assert that cell types do not appear multiple times in each gene, tissue, organism
-    for symbol in reformatted_marker_genes:
-        for organism in reformatted_marker_genes[symbol]:
-            for tissue in reformatted_marker_genes[symbol][organism]:
-                cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
-                assert len(cell_type_ids) == len(list(set(cell_type_ids)))
+    # # assert that cell types do not appear multiple times in each gene, tissue, organism
+    # for symbol in reformatted_marker_genes:
+    #     for organism in reformatted_marker_genes[symbol]:
+    #         for tissue in reformatted_marker_genes[symbol][organism]:
+    #             cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
+    #             assert len(cell_type_ids) == len(list(set(cell_type_ids)))
 
     return marker_genes, reformatted_marker_genes
diff --git a/backend/cellguide/pipeline/computational_marker_genes/computational_markers.py b/backend/cellguide/pipeline/computational_marker_genes/computational_markers.py
diff --git a/backend/cellguide/pipeline/computational_marker_genes/types.py b/backend/cellguide/pipeline/computational_marker_genes/types.py
@@ -6,6 +6,8 @@ class ComputationalMarkerGenes:
     me: float
     pc: float
     marker_score: float
+    specificity: float
+    gene_ontology_term_id: str
     symbol: str
     name: str
     groupby_dims: dict[str, str]
diff --git a/backend/cellguide/pipeline/computational_marker_genes/utils.py b/backend/cellguide/pipeline/computational_marker_genes/utils.py
@@ -2,72 +2,34 @@
 from typing import Tuple
 
 import numpy as np
-from numba import njit
-from scipy import stats
+from numba import njit, prange
 
 from backend.common.constants import DEPLOYMENT_STAGE_TO_API_URL
-from backend.common.utils.rollup import are_cell_types_colinear
 from backend.wmg.data.utils import setup_retry_session
 
 
-@njit
-def nanpercentile_2d(arr: np.ndarray, percentile: float, axis: int) -> np.ndarray:
-    """
-    Calculate the specified percentile of a 2D array along an axis, ignoring NaN values.
-
-    Arguments
-    ---------
-    arr - 2D array to calculate percentile of
-    percentile - percentile to calculate, as a number between 0 and 100
-    axis - axis along which to calculate percentile
-
-    Returns
-    -------
-    The specified percentile of the 2D array along the specified axis.
-    """
-    if axis == 0:
-        result = np.empty(arr.shape[1])
-        for i in range(arr.shape[1]):
-            arr_column = arr[:, i]
-            result[i] = nanpercentile(arr_column, percentile)
-        return result
-    else:
-        result = np.empty(arr.shape[0])
-        for i in range(arr.shape[0]):
-            arr_row = arr[i, :]
-            result[i] = nanpercentile(arr_row, percentile)
-        return result
-
-
-@njit
-def nanpercentile(arr: np.ndarray, percentile: float):
-    """
-    Calculate the specified percentile of an array, ignoring NaN values.
-
-    Arguments
-    ---------
-    arr - array to calculate percentile of
-    percentile - percentile to calculate, as a number between 0 and 100
-
-    Returns
-    -------
-    The specified percentile of the array.
-    """
-
-    arr_without_nan = arr[np.logical_not(np.isnan(arr))]
-    length = len(arr_without_nan)
+@njit(parallel=True)
+def calculate_specificity_excluding_nans(treatment, control):
+    treatment = treatment.flatten()
 
-    if length == 0:
-        return np.nan
+    specificities = np.zeros(treatment.size)
+    for i in prange(treatment.size):
+        if np.isnan(treatment[i]):
+            continue
+        col = control[:, i]
+        col = col[~np.isnan(col)]
+        if col.size == 0:
+            specificities[i] = 1
+        else:
+            specificities[i] = (treatment[i] > col).mean()
+    return specificities
 
-    return np.percentile(arr_without_nan, percentile)
 
-
-def run_ttest(
+def calculate_cohens_d(
     *, sum1: np.ndarray, sumsq1: np.ndarray, n1: np.ndarray, sum2: np.ndarray, sumsq2: np.ndarray, n2: np.ndarray
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
-    Run a t-test on two sets of data, element-wise.
+    Calculates Cohen's d for two sets of data.
     Arrays "1" and "2" have to be broadcastable into each other.
 
     Arguments
@@ -96,73 +58,9 @@ def run_ttest(
         var1[var1 < 0] = 0
         var2 = meansq2 - mean2**2
         var2[var2 < 0] = 0
-
-        var1_n = var1 / n1
-        var2_n = var2 / n2
-        sum_var_n = var1_n + var2_n
-        dof = sum_var_n**2 / (var1_n**2 / (n1 - 1) + var2_n**2 / (n2 - 1))
-        tscores = (mean1 - mean2) / np.sqrt(sum_var_n)
         effects = (mean1 - mean2) / np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 1))
 
-    pvals = stats.t.sf(tscores, dof)
-    return pvals, effects
-
-
-def post_process_stats(
-    *,
-    cell_type_target: str,
-    cell_types_context: np.ndarray,
-    genes: np.ndarray,
-    pvals: np.ndarray,
-    effects: np.ndarray,
-    percentile: float = 0.05,
-) -> dict[str, dict[str, float]]:
-    """
-    Post-process the statistical results to handle colinearity of cell types in the ontology and calculate percentiles.
-
-    Arguments
-    ---------
-    cell_type_target - The target cell type
-    cell_types_context - The context cell types
-    genes - The genes involved in the analysis
-    pvals - The p-values from the statistical test
-    effects - The effect sizes from the statistical test
-    percentile - The percentile to use for thresholding (default is 0.05)
-
-    Returns
-    -------
-    A dictionary mapping marker genes to their statistics.
-    """
-
-    # parent nodes msut not be compared to their children because they share expressions,
-    # since the expressions are rolled up across descendants
-    is_colinear = np.array([are_cell_types_colinear(cell_type, cell_type_target) for cell_type in cell_types_context])
-    effects[is_colinear] = np.nan
-    pvals[is_colinear] = np.nan
-
-    pvals[:, np.all(np.isnan(pvals), axis=0)] = 1
-    effects[:, np.all(np.isnan(effects), axis=0)] = 0
-
-    # aggregate
-    effects = nanpercentile_2d(effects, percentile * 100, 0)
-
-    effects[effects == 0] = np.nan
-
-    pvals = np.sort(pvals, axis=0)[int(np.round(0.05 * pvals.shape[0]))]
-
-    markers = np.array(genes)[np.argsort(-effects)]
-    p = pvals[np.argsort(-effects)]
-    effects = effects[np.argsort(-effects)]
-
-    statistics = []
-    final_markers = []
-    for i in range(len(p)):
-        pi = p[i]
-        ei = effects[i]
-        if ei is not np.nan and pi is not np.nan:
-            statistics.append({"p_value": pi, "effect_size": ei})
-            final_markers.append(markers[i])
-    return dict(zip(list(final_markers), statistics))
+    return effects
 
 
 def query_gene_info_for_gene_description(gene_id: str) -> str:
@@ -193,3 +91,67 @@ def query_gene_info_for_gene_description(gene_id: str) -> str:
         return data["name"]
     else:
         return gene_id
+
+
+@njit(parallel=True)
+def bootstrap_rows_percentiles(
+    X: np.ndarray, random_indices: np.ndarray, num_replicates: int = 1000, num_samples: int = 100, percentile: float = 5
+):
+    """
+    This function bootstraps rows of a given matrix X.
+
+    Arguments
+    ---------
+    X : np.ndarray
+        The input matrix to bootstrap.
+    num_replicates : int, optional
+        The number of bootstrap replicates to generate, by default 1000.
+    num_samples : int, optional
+        The number of samples to draw in each bootstrap replicate, by default 100.
+    percentile : float, optional
+        The percentile of the bootstrapped samples for each replicate, by default 15.
+
+    Returns
+    -------
+    bootstrap_percentile : np.ndarray
+        The percentile of the bootstrapped samples for each replicate.
+    """
+
+    bootstrap_percentile = np.zeros((num_replicates, X.shape[1]), dtype="float")
+    # for each replicate
+    for n_i in prange(num_replicates):
+        bootstrap_percentile[n_i] = sort_matrix_columns(X[random_indices[n_i]], percentile, num_samples)
+
+    return bootstrap_percentile
+
+
+@njit
+def sort_matrix_columns(matrix, percentile, num_samples):
+    """
+    This function sorts the columns of a given matrix and returns the index associated with
+    the specified percentile of the sorted samples for each column. This approximates
+    np.nanpercentile(matrix, percentile, axis=0).
+
+    Arguments
+    ---------
+    matrix : np.ndarray
+        The input matrix to sort.
+    percentile : float
+        The percentile of the sorted samples for each column.
+    num_samples : int
+        The number of samples in each column.
+
+    Returns
+    -------
+    result : np.ndarray
+        The sorted columns of the input matrix.
+    """
+    num_cols = matrix.shape[1]
+    result = np.empty(num_cols)
+    for col in range(num_cols):
+        sorted_col = np.sort(matrix[:, col])
+        num_nans = np.isnan(sorted_col).sum()
+        num_non_nans = num_samples - num_nans
+        sample_index = int(np.round(percentile / 100 * num_non_nans))
+        result[col] = sorted_col[sample_index]
+    return result
diff --git a/backend/cellguide/pipeline/constants.py b/backend/cellguide/pipeline/constants.py
@@ -25,6 +25,6 @@
 # 24 CPUs was chosen to balance memory usage and speed on a c6i.32xlarge EC2 machine
 # In trial runs, the memory usage did not exceed 50% of the available memory, which provides
 # ample buffer.
-CELLGUIDE_PIPELINE_NUM_CPUS = min(os.cpu_count(), os.getenv("CELLGUIDE_PIPELINE_NUM_CPUS", 12))
+CELLGUIDE_PIPELINE_NUM_CPUS = min(os.cpu_count(), os.getenv("CELLGUIDE_PIPELINE_NUM_CPUS", 24))
 
 CELL_GUIDE_DATA_BUCKET_PATH_PREFIX = "s3://cellguide-data-public-"
diff --git a/backend/common/utils/rollup.py b/backend/common/utils/rollup.py
@@ -81,6 +81,26 @@ def are_cell_types_colinear(cell_type1, cell_type2):
     return len(set(descendants1).intersection(ancestors2)) > 0 or len(set(descendants2).intersection(ancestors1)) > 0
 
 
+def get_overlapping_cell_type_descendants(cell_type1, cell_type2):
+    """
+    Get overlapping cell type descendants
+
+    Arguments
+    ---------
+    cell_type1 : str
+        Cell type 1 (cell type ontology term id)
+    cell_type2 : str
+        Cell type 2 (cell type ontology term id)
+    Returns
+    -------
+    list[str]
+    """
+    descendants1 = descendants(cell_type1)
+    descendants2 = descendants(cell_type2)
+
+    return list(set(descendants1).intersection(descendants2))
+
+
 def rollup_across_cell_type_descendants(
     df, cell_type_col="cell_type_ontology_term_id", parallel=True, ignore_cols=None
 ) -> pd.DataFrame: