Skip to content

Commit

Permalink
feat(cellguide): improve computational marker gene algorithm (#6003)
Browse files Browse the repository at this point in the history
  • Loading branch information
atarashansky authored Oct 19, 2023
1 parent 845d776 commit 71d0c23
Show file tree
Hide file tree
Showing 11 changed files with 497 additions and 347 deletions.
35 changes: 29 additions & 6 deletions backend/cellguide/pipeline/computational_marker_genes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,29 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
else:
marker_genes[key] = marker_genes_per_tissue[key]

# convert all groupby_dims IDs to labels as required by CellGuide
organism_id_to_name = {k: v for d in snapshot.primary_filter_dimensions["organism_terms"] for k, v in d.items()}
tissue_id_to_name = {
k: v
for organism in snapshot.primary_filter_dimensions["tissue_terms"]
for i in snapshot.primary_filter_dimensions["tissue_terms"][organism]
for k, v in i.items()
}
for _, marker_gene_stats_list in marker_genes.items():
for marker_gene_stats in marker_gene_stats_list:
groupby_dims = marker_gene_stats.groupby_dims
groupby_terms = list(groupby_dims.keys())
groupby_term_labels = [term.rsplit("_", 1)[0] + "_label" for term in groupby_terms]
groupby_dims_new = dict(zip(groupby_term_labels, (groupby_dims[term] for term in groupby_terms)))

for key in groupby_dims_new:
if key == "tissue_ontology_term_label":
groupby_dims_new[key] = tissue_id_to_name.get(groupby_dims_new[key], groupby_dims_new[key])
elif key == "organism_ontology_term_label":
groupby_dims_new[key] = organism_id_to_name.get(groupby_dims_new[key], groupby_dims_new[key])

marker_gene_stats.groupby_dims = groupby_dims_new

reformatted_marker_genes = {}
for cell_type_id, marker_gene_stats_list in marker_genes.items():
for marker_gene_stats in marker_gene_stats_list:
Expand All @@ -87,11 +110,11 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
)
reformatted_marker_genes[symbol][organism][tissue].append(data)

# assert that cell types do not appear multiple times in each gene, tissue, organism
for symbol in reformatted_marker_genes:
for organism in reformatted_marker_genes[symbol]:
for tissue in reformatted_marker_genes[symbol][organism]:
cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
assert len(cell_type_ids) == len(list(set(cell_type_ids)))
# # assert that cell types do not appear multiple times in each gene, tissue, organism
# for symbol in reformatted_marker_genes:
# for organism in reformatted_marker_genes[symbol]:
# for tissue in reformatted_marker_genes[symbol][organism]:
# cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
# assert len(cell_type_ids) == len(list(set(cell_type_ids)))

return marker_genes, reformatted_marker_genes

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ class ComputationalMarkerGenes:
me: float
pc: float
marker_score: float
specificity: float
gene_ontology_term_id: str
symbol: str
name: str
groupby_dims: dict[str, str]
202 changes: 82 additions & 120 deletions backend/cellguide/pipeline/computational_marker_genes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,72 +2,34 @@
from typing import Tuple

import numpy as np
from numba import njit
from scipy import stats
from numba import njit, prange

from backend.common.constants import DEPLOYMENT_STAGE_TO_API_URL
from backend.common.utils.rollup import are_cell_types_colinear
from backend.wmg.data.utils import setup_retry_session


@njit
def nanpercentile_2d(arr: np.ndarray, percentile: float, axis: int) -> np.ndarray:
"""
Calculate the specified percentile of a 2D array along an axis, ignoring NaN values.
Arguments
---------
arr - 2D array to calculate percentile of
percentile - percentile to calculate, as a number between 0 and 100
axis - axis along which to calculate percentile
Returns
-------
The specified percentile of the 2D array along the specified axis.
"""
if axis == 0:
result = np.empty(arr.shape[1])
for i in range(arr.shape[1]):
arr_column = arr[:, i]
result[i] = nanpercentile(arr_column, percentile)
return result
else:
result = np.empty(arr.shape[0])
for i in range(arr.shape[0]):
arr_row = arr[i, :]
result[i] = nanpercentile(arr_row, percentile)
return result


@njit
def nanpercentile(arr: np.ndarray, percentile: float):
"""
Calculate the specified percentile of an array, ignoring NaN values.
Arguments
---------
arr - array to calculate percentile of
percentile - percentile to calculate, as a number between 0 and 100
Returns
-------
The specified percentile of the array.
"""

arr_without_nan = arr[np.logical_not(np.isnan(arr))]
length = len(arr_without_nan)
@njit(parallel=True)
def calculate_specificity_excluding_nans(treatment, control):
treatment = treatment.flatten()

if length == 0:
return np.nan
specificities = np.zeros(treatment.size)
for i in prange(treatment.size):
if np.isnan(treatment[i]):
continue
col = control[:, i]
col = col[~np.isnan(col)]
if col.size == 0:
specificities[i] = 1
else:
specificities[i] = (treatment[i] > col).mean()
return specificities

return np.percentile(arr_without_nan, percentile)


def run_ttest(
def calculate_cohens_d(
*, sum1: np.ndarray, sumsq1: np.ndarray, n1: np.ndarray, sum2: np.ndarray, sumsq2: np.ndarray, n2: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
"""
Run a t-test on two sets of data, element-wise.
Calculates Cohen's d for two sets of data.
Arrays "1" and "2" have to be broadcastable into each other.
Arguments
Expand Down Expand Up @@ -96,73 +58,9 @@ def run_ttest(
var1[var1 < 0] = 0
var2 = meansq2 - mean2**2
var2[var2 < 0] = 0

var1_n = var1 / n1
var2_n = var2 / n2
sum_var_n = var1_n + var2_n
dof = sum_var_n**2 / (var1_n**2 / (n1 - 1) + var2_n**2 / (n2 - 1))
tscores = (mean1 - mean2) / np.sqrt(sum_var_n)
effects = (mean1 - mean2) / np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 1))

pvals = stats.t.sf(tscores, dof)
return pvals, effects


def post_process_stats(
*,
cell_type_target: str,
cell_types_context: np.ndarray,
genes: np.ndarray,
pvals: np.ndarray,
effects: np.ndarray,
percentile: float = 0.05,
) -> dict[str, dict[str, float]]:
"""
Post-process the statistical results to handle colinearity of cell types in the ontology and calculate percentiles.
Arguments
---------
cell_type_target - The target cell type
cell_types_context - The context cell types
genes - The genes involved in the analysis
pvals - The p-values from the statistical test
effects - The effect sizes from the statistical test
percentile - The percentile to use for thresholding (default is 0.05)
Returns
-------
A dictionary mapping marker genes to their statistics.
"""

# parent nodes msut not be compared to their children because they share expressions,
# since the expressions are rolled up across descendants
is_colinear = np.array([are_cell_types_colinear(cell_type, cell_type_target) for cell_type in cell_types_context])
effects[is_colinear] = np.nan
pvals[is_colinear] = np.nan

pvals[:, np.all(np.isnan(pvals), axis=0)] = 1
effects[:, np.all(np.isnan(effects), axis=0)] = 0

# aggregate
effects = nanpercentile_2d(effects, percentile * 100, 0)

effects[effects == 0] = np.nan

pvals = np.sort(pvals, axis=0)[int(np.round(0.05 * pvals.shape[0]))]

markers = np.array(genes)[np.argsort(-effects)]
p = pvals[np.argsort(-effects)]
effects = effects[np.argsort(-effects)]

statistics = []
final_markers = []
for i in range(len(p)):
pi = p[i]
ei = effects[i]
if ei is not np.nan and pi is not np.nan:
statistics.append({"p_value": pi, "effect_size": ei})
final_markers.append(markers[i])
return dict(zip(list(final_markers), statistics))
return effects


def query_gene_info_for_gene_description(gene_id: str) -> str:
Expand Down Expand Up @@ -193,3 +91,67 @@ def query_gene_info_for_gene_description(gene_id: str) -> str:
return data["name"]
else:
return gene_id


@njit(parallel=True)
def bootstrap_rows_percentiles(
X: np.ndarray, random_indices: np.ndarray, num_replicates: int = 1000, num_samples: int = 100, percentile: float = 5
):
"""
This function bootstraps rows of a given matrix X.
Arguments
---------
X : np.ndarray
The input matrix to bootstrap.
num_replicates : int, optional
The number of bootstrap replicates to generate, by default 1000.
num_samples : int, optional
The number of samples to draw in each bootstrap replicate, by default 100.
percentile : float, optional
The percentile of the bootstrapped samples for each replicate, by default 15.
Returns
-------
bootstrap_percentile : np.ndarray
The percentile of the bootstrapped samples for each replicate.
"""

bootstrap_percentile = np.zeros((num_replicates, X.shape[1]), dtype="float")
# for each replicate
for n_i in prange(num_replicates):
bootstrap_percentile[n_i] = sort_matrix_columns(X[random_indices[n_i]], percentile, num_samples)

return bootstrap_percentile


@njit
def sort_matrix_columns(matrix, percentile, num_samples):
"""
This function sorts the columns of a given matrix and returns the index associated with
the specified percentile of the sorted samples for each column. This approximates
np.nanpercentile(matrix, percentile, axis=0).
Arguments
---------
matrix : np.ndarray
The input matrix to sort.
percentile : float
The percentile of the sorted samples for each column.
num_samples : int
The number of samples in each column.
Returns
-------
result : np.ndarray
The sorted columns of the input matrix.
"""
num_cols = matrix.shape[1]
result = np.empty(num_cols)
for col in range(num_cols):
sorted_col = np.sort(matrix[:, col])
num_nans = np.isnan(sorted_col).sum()
num_non_nans = num_samples - num_nans
sample_index = int(np.round(percentile / 100 * num_non_nans))
result[col] = sorted_col[sample_index]
return result
2 changes: 1 addition & 1 deletion backend/cellguide/pipeline/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@
# 24 CPUs was chosen to balance memory usage and speed on a c6i.32xlarge EC2 machine
# In trial runs, the memory usage did not exceed 50% of the available memory, which provides
# ample buffer.
CELLGUIDE_PIPELINE_NUM_CPUS = min(os.cpu_count(), os.getenv("CELLGUIDE_PIPELINE_NUM_CPUS", 12))
CELLGUIDE_PIPELINE_NUM_CPUS = min(os.cpu_count(), os.getenv("CELLGUIDE_PIPELINE_NUM_CPUS", 24))

CELL_GUIDE_DATA_BUCKET_PATH_PREFIX = "s3://cellguide-data-public-"
20 changes: 20 additions & 0 deletions backend/common/utils/rollup.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,26 @@ def are_cell_types_colinear(cell_type1, cell_type2):
return len(set(descendants1).intersection(ancestors2)) > 0 or len(set(descendants2).intersection(ancestors1)) > 0


def get_overlapping_cell_type_descendants(cell_type1, cell_type2):
"""
Get overlapping cell type descendants
Arguments
---------
cell_type1 : str
Cell type 1 (cell type ontology term id)
cell_type2 : str
Cell type 2 (cell type ontology term id)
Returns
-------
list[str]
"""
descendants1 = descendants(cell_type1)
descendants2 = descendants(cell_type2)

Check warning on line 99 in backend/common/utils/rollup.py

View check run for this annotation

Codecov / codecov/patch

backend/common/utils/rollup.py#L98-L99

Added lines #L98 - L99 were not covered by tests

return list(set(descendants1).intersection(descendants2))

Check warning on line 101 in backend/common/utils/rollup.py

View check run for this annotation

Codecov / codecov/patch

backend/common/utils/rollup.py#L101

Added line #L101 was not covered by tests


def rollup_across_cell_type_descendants(
df, cell_type_col="cell_type_ontology_term_id", parallel=True, ignore_cols=None
) -> pd.DataFrame:
Expand Down
Loading

0 comments on commit 71d0c23

Please sign in to comment.