Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differential expression performance improvements #234

Merged
merged 26 commits into from
May 12, 2022
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 2 additions & 13 deletions hosted/create_rdev_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,6 @@
# A list of allowed matrix types. If an empty list, then all matrix types are allowed
allowed_matrix_types: []

diffexp:
alg_cxg:
# The number of threads to use is computed from min(max_workers, cpu_multiplier * cpu_count),
# where cpu_count is determined at runtime.
max_workers: 64
cpu_multiplier: 4

# The target number of matrix elements that are evaluated
# together in one thread.
target_workunit: 16_000_000

data_locator:
api_base: http://{env}-backend.internal.rdev.single-cell.czi.technology/dp/v1
s3:
Expand All @@ -74,11 +63,11 @@
# data_locator / s3 / region_name.
tiledb_ctx:
sm.tile_cache_size: 60129542144 # 56 GB
sm.num_reader_threads: 32
py.init_buffer_bytes: 17179869184 # 16GiB

limits:
column_request_max: 32
diffexp_cellcount_max: 50000
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The setting for prod is in single-cell-infra; PR incoming.

diffexp_cellcount_max: 1500000

dataset:
app:
Expand Down
2 changes: 1 addition & 1 deletion hosted/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ umap-learn==0.4.6
sentry-sdk[flask]==0.14.3
six==1.14.0
sqlalchemy==1.3.18
tiledb==0.13.1
tiledb==0.13.2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stick with 0.13.2 or should we try 0.14.x now? @MDunitz has already found issues with 0.14.x, so maybe hold off if 0.13.2 is well-tested?

Either way, we need to keep Explorer at-or-ahead of Portal's TileDB version. And we should keep this version in sync with server/requirements.txt (in this repo).

urllib3==1.26.5
Werkzeug==1.0.1
zipp==3.1.0
163 changes: 0 additions & 163 deletions server/common/compute/diffexp_generic.py

This file was deleted.

17 changes: 0 additions & 17 deletions server/common/config/server_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from server.common.errors import ConfigurationError, DatasetAccessError
from server.common.utils.data_locator import discover_s3_region_name
from server.common.utils.utils import is_port_available, find_available_port, custom_format_warning
from server.compute import diffexp_cxg as diffexp_tiledb
from server.dataset.matrix_loader import MatrixDataLoader, MatrixDataType


Expand Down Expand Up @@ -48,10 +47,6 @@ def __init__(self, app_config, default_config):
self.single_dataset__about = default_config["single_dataset"]["about"]
self.single_dataset__title = default_config["single_dataset"]["title"]

self.diffexp__alg_cxg__max_workers = default_config["diffexp"]["alg_cxg"]["max_workers"]
self.diffexp__alg_cxg__cpu_multiplier = default_config["diffexp"]["alg_cxg"]["cpu_multiplier"]
self.diffexp__alg_cxg__target_workunit = default_config["diffexp"]["alg_cxg"]["target_workunit"]

self.data_locator__s3__region_name = default_config["data_locator"]["s3"]["region_name"]
self.data_locator__api_base = default_config["data_locator"]["api_base"]
self.adaptor__cxg_adaptor__tiledb_ctx = default_config["adaptor"]["cxg_adaptor"]["tiledb_ctx"]
Expand All @@ -69,7 +64,6 @@ def complete_config(self, context):
self.handle_adaptor() # may depend on data_locator
self.handle_single_dataset(context) # may depend on adaptor
self.handle_multi_dataset() # may depend on adaptor
self.handle_diffexp()
self.handle_limits()

self.check_config()
Expand Down Expand Up @@ -255,17 +249,6 @@ def handle_multi_dataset(self):
except ValueError:
raise ConfigurationError(f'Invalid matrix type in "allowed_matrix_types": {mtype}')

def handle_diffexp(self):
self.validate_correct_type_of_configuration_attribute("diffexp__alg_cxg__max_workers", (str, int))
self.validate_correct_type_of_configuration_attribute("diffexp__alg_cxg__cpu_multiplier", int)
self.validate_correct_type_of_configuration_attribute("diffexp__alg_cxg__target_workunit", int)

max_workers = self.diffexp__alg_cxg__max_workers
cpu_multiplier = self.diffexp__alg_cxg__cpu_multiplier
cpu_count = os.cpu_count()
max_workers = min(max_workers, cpu_multiplier * cpu_count)
diffexp_tiledb.set_config(max_workers, self.diffexp__alg_cxg__target_workunit)

def handle_adaptor(self):
# cxg
self.validate_correct_type_of_configuration_attribute("adaptor__cxg_adaptor__tiledb_ctx", dict)
Expand Down
Loading