
Commit

Merge branch 'main' into dependabot/pip/scanpy-1.9.5
RemyLau committed Oct 3, 2023
2 parents 6bab1ae + c023d40 commit b85eb38
Showing 18 changed files with 371 additions and 107 deletions.
1 change: 0 additions & 1 deletion .github/dependabot.yml
@@ -15,4 +15,3 @@ updates:
ignore:
- dependency-name: "torch" # see PyG
- dependency-name: "torchvision"
- dependency-name: "numpy" # see numba
34 changes: 34 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,34 @@
name: Lint

on:
push:
pull_request:
branches:
- main

jobs:
run_lint_and_install_test:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest]
python-version: ['3.8', '3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install -U pip
pip install tox tox-gh-actions -U
- name: Lint and test installation with tox
run: tox
env:
RUN_SETTINGS: cpu-notest
15 changes: 8 additions & 7 deletions .github/workflows/test_examples.yml
@@ -6,17 +6,18 @@ on:
- cron: '59 23 * * 0,2,4'

env:
# CUDA paths for MSU ICER HPC with "module load GCC/8.3.0 CUDA/10.2.89"
CUDA_PATH: '/opt/software/CUDA/10.2.89-GCC-8.3.0'
LD_LIBRARY_PATH: '/opt/software/CUDA/10.2.89-GCC-8.3.0/lib64'
### CUDA paths for MSU ICER HPC with "module load CUDA/11.8"
CUDA_PATH: '/opt/software/CUDA/11.8.0'
LD_LIBRARY_PATH: '/opt/software/CUDA/11.8.0/lib64'
### CUDA paths for MSU ICER HPC with "module load GCC/8.3.0 CUDA/10.2.89"
# CUDA_PATH: '/opt/software/CUDA/10.2.89-GCC-8.3.0'
# LD_LIBRARY_PATH: '/opt/software/CUDA/10.2.89-GCC-8.3.0/lib64'

jobs:
run_examples:
runs-on: self-hosted
strategy:
fail-fast: false
matrix:
python-version: ['3.8']

steps:
- uses: actions/checkout@v3
@@ -27,7 +28,7 @@ jobs:
- name: Install dependencies
run:
python -m pip install -U pip
pip install tox tox-gh-actions -U
pip install tox -U

- name: Test with tox
run: tox -e python${{ matrix.python-version }}-gpu
run: tox -e py38-gpu-test
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -13,13 +13,13 @@ repos:
args: [-c=.yamllint.yml]

- repo: https://github.com/asottile/pyupgrade
rev: v3.10.1
rev: v3.13.0
hooks:
- id: pyupgrade
args: [--py3-plus]

- repo: https://github.com/google/yapf
rev: v0.40.0
rev: v0.40.2
hooks:
- id: yapf
name: Format code
18 changes: 9 additions & 9 deletions README.md
@@ -110,8 +110,8 @@ as well as easily reproducible experiments by providing unified tools for
The full installation process might be a bit tedious and could involve some debugging when using CUDA-enabled packages.
Thus, we provide an `install.sh` script that simplifies the installation process, assuming the user has [conda](https://conda.io/projects/conda/en/latest/index.html) set up on their machine.
The installation script creates a conda environment `dance` and installs the DANCE package along with all its dependencies for a specific CUDA version.
Currently, two options are accepted: `cpu` and `cu117`.
For example, to install the DANCE package using CUDA11.7 in a `dance-env` conda environment, simply run:
Currently, two options are accepted: `cpu` and `cu118`.
For example, to install the DANCE package using CUDA 11.8 in a `dance-env` conda environment, simply run:

```bash
# Clone the repository via SSH
@@ -120,7 +120,7 @@ git clone git@github.com:OmicsML/dance.git && cd dance
# git clone https://github.com/OmicsML/dance.git && cd dance
# Run the auto installation script to install DANCE and its dependencies in a conda environment
source install.sh cu117 dance-env
source install.sh cu118 dance-env
```

**Note**: the first argument (CUDA version) is mandatory, while the second argument (conda environment name) is optional (default is `dance`).
@@ -140,17 +140,17 @@ conda create -n dance python=3.8 -y && conda activate dance-dev
Then, install CUDA-enabled packages (PyTorch, PyG, DGL):

```bash
conda install pytorch=2.0.0 torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia -y
conda install pyg=2.3.0 -c pyg -y
conda install dgl=1.0.1 -c dglteam/label/cu117 -y
conda install pytorch=2.0.1 torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
conda install pyg=2.3.1 -c pyg -y
conda install dgl=1.1.2 -c dglteam/label/cu118 -y
```

Alternatively, install these dependencies for CPU only:

```bash
conda install pytorch=2.0.0 torchvision torchaudio cpuonly -c pytorch -y
conda install pyg=2.3.0 -c pyg -y
conda install dgl -c dglteam -y
conda install pytorch=2.0.1 torchvision torchaudio cpuonly -c pytorch -y
conda install pyg=2.3.1 -c pyg -y
conda install dgl=1.1.2 -c dglteam -y
```

For more information about installation or other CUDA version options, check out the installation pages for the corresponding packages
1 change: 1 addition & 0 deletions dance/data/base.py
@@ -456,6 +456,7 @@ def get_feature(self, *, split_name: Optional[str] = None, return_type: FeatType
if split_name is not None:
if channel_type in ["X", "raw_X", "obs", "obsm", "obsp", "layers"]:
idx = self.get_split_idx(split_name, error_on_miss=True)
idx = list(filter(lambda a: a < feature.shape[0], idx))
feature = feature[idx][:, idx] if channel_type == "obsp" else feature[idx]
else:
logger.warning(f"Indexing option for {channel_type!r} not implemented yet.")
97 changes: 92 additions & 5 deletions dance/datasets/multimodality.py
@@ -62,8 +62,57 @@ def data_paths(self) -> List[str]:
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_test_mod2.h5ad")
]
if self.subtask == "10k_pbmc":
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.10kanti_dataset_subset.output_test_mod2.h5ad")
]
if self.subtask == "pbmc_cite":
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.citeanti_dataset.output_test_mod2.h5ad")
]
if self.subtask.startswith("5k_pbmc"):
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.5kanti_dataset.output_test_mod2.h5ad")
]
if self.subtask.startswith("openproblems_2022"):
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.open_dataset.output_test_mod2.h5ad")
]
if self.subtask.startswith("GSE127064"):
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE126074_dataset.output_test_mod2.h5ad")
]
if self.subtask.startswith("GSE117089"):
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE117089_dataset.output_test_mod2.h5ad")
]
if self.subtask.startswith("GSE140203"):
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_train_mod2.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod1.h5ad"),
osp.join(self.root, self.subtask, f"{self.subtask}.GSE140203_dataset.output_test_mod2.h5ad")
]
elif self.TASK == "match_modality":
paths = [
osp.join(self.root, self.subtask, f"{self.subtask}.censor_dataset.output_train_mod1.h5ad"),
@@ -100,6 +149,42 @@ class ModalityPredictionDataset(MultiModalityDataset):
"https://www.dropbox.com/s/cz60vp7bwapz0kw/openproblems_bmmc_multiome_phase2_rna.zip?dl=1",
"openproblems_bmmc_cite_phase2_rna_subset":
"https://www.dropbox.com/s/veytldxkgzyoa8j/openproblems_bmmc_cite_phase2_rna_subset.zip?dl=1",
"5k_pbmc":
"https://www.dropbox.com/scl/fi/uoyis946glh0oo7g833qj/5k_pbmc.zip?rlkey=mw9cvqq7e12iowfbr9rp7av5u&dl=1",
"5k_pbmc_subset":
"https://www.dropbox.com/scl/fi/pykqc9zyt1fjypnjf4m1l/5k_pbmc_subset.zip?rlkey=brkmnqhfz5yl9axiuu0f8gmxy&dl=1",
"10k_pbmc":
"https://www.dropbox.com/scl/fi/npz3n36d3w089creppph2/10k_pbmc.zip?rlkey=6yyv61omv2rw7sqqmfp6u7m1s&dl=1",
"pbmc_cite":
"https://www.dropbox.com/scl/fi/8yvel9lu2f4pbemjeihzq/pbmc_cite.zip?rlkey=5f5jpjy1fcg14hwzot0hot7xd&dl=1",
"openproblems_2022_multi_atac2gex":
"https://www.dropbox.com/scl/fi/4ynxepu306g3k6vqpi3aw/openproblems_2022_multi_atac2gex.zip?rlkey=2mq5vjnsh26gg5zgq9d85ikcp&dl=1",
"openproblems_2022_cite_gex2adt":
"https://www.dropbox.com/scl/fi/dalt3qxwe440107ihjbpy/openproblems_2022_cite_gex2adt.zip?rlkey=ps1fvcr622vhibc1wc1umfdaw&dl=1",
"GSE127064_AdBrain_gex2atac":
"https://www.dropbox.com/scl/fi/4ybsx6pgiuy6j9m0y92ly/GSE127064_AdBrain_gex2atac.zip?rlkey=6a5u7p7xr2dqsoduflzxjluja&dl=1",
"GSE127064_p0Brain_gex2atac":
"https://www.dropbox.com/scl/fi/k4p3nkkqq56ev6ljyo5se/GSE127064_p0Brain_gex2atac.zip?rlkey=y7kayqmk2l72jjogzlvfxtl74&dl=1",
"GSE117089_mouse_gex2atac":
"https://www.dropbox.com/scl/fi/egktuwiognr06xebeuouk/GSE117089_mouse_gex2atac.zip?rlkey=jadp3hlopc3112lmxe6nz5cd1&dl=1",
"GSE117089_A549_gex2atac":
"https://www.dropbox.com/scl/fi/b7evc2n5ih5o3xxwcd7uq/GSE117089_A549_gex2atac.zip?rlkey=b5o0ykptfodim59qwnu2m89fh&dl=1",
"GSE117089_sciCAR_gex2atac":
"https://www.dropbox.com/scl/fi/juibpvmtv2otvfsq1xyr7/GSE117089_sciCAR_gex2atac.zip?rlkey=qcdbfqsuhab56bc553cwm78gc&dl=1",
"GSE140203_3T3_HG19_atac2gex":
"https://www.dropbox.com/scl/fi/v1vbypz87t1rz012vojkh/GSE140203_3T3_HG19_atac2gex.zip?rlkey=xmxrwso5e5ty3w53ctbm5bo9z&dl=1",
"GSE140203_3T3_MM10_atac2gex":
"https://www.dropbox.com/scl/fi/po9k064twny51subze6df/GSE140203_3T3_MM10_atac2gex.zip?rlkey=q0b4y58bsvacnjrmvsclk4jqu&dl=1",
"GSE140203_12878.rep2_atac2gex":
"https://www.dropbox.com/scl/fi/jqijimb7h6cv4w4hkax1q/GSE140203_12878.rep2_atac2gex.zip?rlkey=c837xkoacap4wjszffpfrmuak&dl=1",
"GSE140203_12878.rep3_atac2gex":
"https://www.dropbox.com/scl/fi/wlv9dhvylz78kq8ezncmd/GSE140203_12878.rep3_atac2gex.zip?rlkey=5r607plnqzlxdgxtc4le8d6o1&dl=1",
"GSE140203_K562_HG19_atac2gex":
"https://www.dropbox.com/scl/fi/n2he1br3u604p3mgniowz/GSE140203_K562_HG19_atac2gex.zip?rlkey=2lhe7s5run8ly5uk4b0vfemyj&dl=1",
"GSE140203_K562_MM10_atac2gex":
"https://www.dropbox.com/scl/fi/dhdorqy87915uah3xl07a/GSE140203_K562_MM10_atac2gex.zip?rlkey=ecwsy5sp7f2i2gtjo1qyaf4zt&dl=1",
"GSE140203_LUNG_atac2gex":
"https://www.dropbox.com/scl/fi/gabugiw244ky85j3ckq4d/GSE140203_LUNG_atac2gex.zip?rlkey=uj0we276s6ay2acpioj4tmfj3&dl=1"
}
SUBTASK_NAME_MAP = {
"adt2gex": "openproblems_bmmc_cite_phase2_mod2",
@@ -110,9 +195,10 @@ class ModalityPredictionDataset(MultiModalityDataset):
}
AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))

def __init__(self, subtask, root="./data", preprocess=None):
def __init__(self, subtask, root="./data", preprocess=None, span=0.3):
# TODO: factor out preprocess
self.preprocess = preprocess
self.span = span
super().__init__(subtask, root)

def _raw_to_dance(self, raw_data):
@@ -135,7 +221,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
if self.preprocess == "feature_selection":
if raw_data[0].shape[1] > selection_threshold:
sc.pp.highly_variable_genes(raw_data[0], layer="counts", flavor="seurat_v3",
n_top_genes=selection_threshold)
n_top_genes=selection_threshold, span=self.span)
raw_data[2].var["highly_variable"] = raw_data[0].var["highly_variable"]
for i in [0, 2]:
raw_data[i] = raw_data[i][:, raw_data[i].var["highly_variable"]]
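The `span` value forwarded above is the loess span used by scanpy's `seurat_v3` flavor of highly-variable-gene selection. A self-contained sketch of the underlying call on a synthetic AnnData object (all sizes and parameter values here are illustrative assumptions, not the dataset's real inputs):

```python
# Synthetic example of the HVG call that the feature_selection preprocess wraps.
import numpy as np
import anndata as ad
import scanpy as sc

counts = np.random.poisson(1.0, size=(500, 2000)).astype(np.float32)
adata = ad.AnnData(X=counts.copy())
adata.layers["counts"] = counts          # seurat_v3 expects raw counts in this layer

sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    n_top_genes=1000,
    span=0.3,                            # fraction of data used in the loess fit
)
adata = adata[:, adata.var["highly_variable"]]  # keep only the selected genes
```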
@@ -169,10 +255,11 @@ class ModalityMatchingDataset(MultiModalityDataset):
}
AVAILABLE_DATA = sorted(list(URL_DICT) + list(SUBTASK_NAME_MAP))

def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None):
def __init__(self, subtask, root="./data", preprocess=None, pkl_path=None, span=0.3):
# TODO: factor out preprocess
self.preprocess = preprocess
self.pkl_path = pkl_path
self.span = span
super().__init__(subtask, root)

def _raw_to_dance(self, raw_data):
@@ -252,7 +339,7 @@ def _maybe_preprocess(self, raw_data, selection_threshold=10000):
for i in range(2):
if modalities[i].shape[1] > selection_threshold:
sc.pp.highly_variable_genes(modalities[i], layer="counts", flavor="seurat_v3",
n_top_genes=selection_threshold)
n_top_genes=selection_threshold, span=self.span)
modalities[i + 2].var["highly_variable"] = modalities[i].var["highly_variable"]
modalities[i] = modalities[i][:, modalities[i].var["highly_variable"]]
modalities[i + 2] = modalities[i + 2][:, modalities[i + 2].var["highly_variable"]]
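With this change, `span` threads from both dataset constructors down to the HVG selection shown above. A minimal usage sketch, assuming the module path `dance.datasets.multimodality` and using example argument values throughout:

```python
# Example values only; the subtask name comes from the list added in this commit.
from dance.datasets.multimodality import ModalityPredictionDataset

dataset = ModalityPredictionDataset(
    "10k_pbmc",                      # one of the newly supported subtasks
    root="./data",
    preprocess="feature_selection",  # triggers the highly_variable_genes call
    span=0.5,                        # forwarded to sc.pp.highly_variable_genes
)
```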
