
Commit 5f5913e

Merge branch 'master' into feature/improve_wandb_cyclegan_reporting

mcgibbon committed Jun 22, 2023
2 parents 26de524 + 9327855
Showing 200 changed files with 32,418 additions and 585 deletions.
37 changes: 37 additions & 0 deletions .circleci/config.yml
@@ -232,10 +232,43 @@ jobs:
          cd external/radiation
          . /home/circleci/.nix-profile/etc/profile.d/nix.sh
          nix-shell --command "pytest -s tests/test_driver"

  build_and_test_scream:
    parameters:
      image:
        default: prognostic_scream_run
        type: string
    machine:
      image: ubuntu-2004:202111-02
    resource_class: large
    environment:
      GOOGLE_PROJECT_ID: vcm-ml
      GOOGLE_APPLICATION_CREDENTIALS: /tmp/key.json
      GOOGLE_COMPUTE_ZONE: us-central1
      IMAGE: <<parameters.image>>
    steps:
      - checkout
      - run:
          name: "gcloud auth"
          command: |
            echo $ENCODED_GOOGLE_CREDENTIALS | base64 -d > $GOOGLE_APPLICATION_CREDENTIALS
            echo "export GCLOUD_SERVICE_KEY=\$(echo \$ENCODED_GOOGLE_CREDENTIALS | base64 --decode)" >> $BASH_ENV
      - gcp-gcr/gcr-auth
      - run:
          name: "Build and push scream image"
          no_output_timeout: 20m
          command: |
            sudo chown -R circleci:circleci /home/circleci/.docker && \
            .circleci/build_and_push_image.sh

parameters:
  run-weekly-workflow:
    type: boolean
    default: false

workflows:
  version: 2
  test_and_lint:
    when:
      not: << pipeline.parameters.run-weekly-workflow >>
    jobs:
      - lint
      - argo
@@ -309,3 +342,7 @@ workflows:
            branches:
              only: master
      - validate_radiation_port
  weekly-scream-workflow:
    when: << pipeline.parameters.run-weekly-workflow >>
    jobs:
      - build_and_test_scream
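
The weekly-scream-workflow above runs only when the run-weekly-workflow pipeline parameter is true. A minimal sketch of triggering it through the CircleCI v2 API (the project slug, branch, and token variable are assumptions for illustration, not taken from this diff):

import os
import requests

# Hypothetical trigger for the weekly workflow; replace <org>/<repo> with the
# real project slug. CIRCLE_TOKEN must hold a valid CircleCI API token.
response = requests.post(
    "https://circleci.com/api/v2/project/gh/<org>/<repo>/pipeline",
    headers={"Circle-Token": os.environ["CIRCLE_TOKEN"]},
    json={"branch": "master", "parameters": {"run-weekly-workflow": True}},
)
response.raise_for_status()
print(response.json()["number"])  # pipeline number of the triggered run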
1 change: 1 addition & 0 deletions .environment-scripts/install_local_packages.sh
@@ -24,5 +24,6 @@ pip install -c constraints.txt \
    -e workflows/fine_res_budget \
    -e workflows/dataflow \
    -e workflows/diagnostics \
    -e workflows/prognostic_scream_run \
    -e external/wandb-query
set +e
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -39,6 +39,8 @@ repos:
workflows/dataflow/fv3net/pipelines/restarts_to_zarr/.+ |
workflows/prognostic_c48_run/.+ |
workflows/prognostic_c48_run/tests/.+ |
workflows/prognostic_scream_run/.+ |
workflows/prognostic_scream_run/tests/.+ |
external/fv3fit/fv3fit/.+ |
external/loaders/loaders/.+ |
external/radiation/radiation/.+ |
21 changes: 21 additions & 0 deletions Makefile
@@ -134,6 +134,27 @@ enter_%:
	-w $(PROGNOSTIC_RUN_WORKDIR) \
	$(REGISTRY)/$*:$(VERSION) bash

SCREAM_INSTALL_PATH ?= docker/prognostic_scream_run/scream
SCREAM_BRANCH ?= master
clone_scream_repository:
	if [ ! -d ${SCREAM_INSTALL_PATH} ]; then \
		git clone -b ${SCREAM_BRANCH} https://github.com/E3SM-Project/scream.git ${SCREAM_INSTALL_PATH}; \
	else \
		echo "${SCREAM_INSTALL_PATH} already exists, nothing to clone"; \
	fi

update_scream_repository: clone_scream_repository
	cd "${SCREAM_INSTALL_PATH}"; \
	git pull; \
	git submodule update --init --recursive; \
	git submodule sync --recursive

build_image_prognostic_scream_run: update_scream_repository
	tools/docker_build_cached.sh $(REGISTRY)/prognostic_scream_run:$(CACHE_TAG) \
		-f docker/prognostic_scream_run/Dockerfile -t $(REGISTRY)/prognostic_scream_run:$(VERSION) .

image_test_prognostic_scream_run:
	tools/docker-run --rm \
		-v $(shell pwd)/tests/scream_run_integration:/tmp/scream_run_integration \
		$(REGISTRY)/prognostic_scream_run:$(VERSION) \
		/tmp/scream_run_integration/test_scream_run.sh

############################################################
# Documentation (rules match "deploy_docs_%")
############################################################
2 changes: 1 addition & 1 deletion constraints.txt
@@ -315,7 +315,7 @@ wrapt==1.13.3
xarray==0.19.0
xgcm==0.6.1
xmltodict==0.12.0
-xpartition==0.2.0
+xpartition==0.2.1
yarl==1.6.3
yq==2.11.0
zarr==2.13.2
126 changes: 126 additions & 0 deletions docker/prognostic_scream_run/Dockerfile
@@ -0,0 +1,126 @@
ARG MAMBAFORGE_VERSION=4.14.0-0
FROM condaforge/mambaforge:${MAMBAFORGE_VERSION} AS base

ARG PNETCDF_VERSION=1.12.3
ENV PNETCDF_VERSION=${PNETCDF_VERSION}

ARG LIBNETCDF_VERSION=4.8.1
ENV LIBNETCDF_VERSION=${LIBNETCDF_VERSION}

ARG NETCDF_FORTRAN_VERSION=4.6.0
ENV NETCDF_FORTRAN_VERSION=${NETCDF_FORTRAN_VERSION}

ARG ESMF_VERSION=8.4.0
ENV ESMF_VERSION=${ESMF_VERSION}

ARG GCC_VERSION=10.*
ENV GCC_VERSION=${GCC_VERSION}

ENV USER=root
ENV LOGNAME=root

SHELL ["/bin/bash", "-c"]

# Install common packages
RUN mamba install --yes -c conda-forge \
cmake \
make \
wget \
curl \
subversion \
m4 \
pytest \
pytest-cov \
pyyaml \
vim \
openssh && \
rm -rf /opt/conda/pkgs/*

# Install version locked packages
RUN mamba install --yes -c conda-forge \
libnetcdf=${LIBNETCDF_VERSION}=*openmpi* \
netcdf-fortran=${NETCDF_FORTRAN_VERSION}=*openmpi* \
esmf=${ESMF_VERSION}=*openmpi* \
gcc_linux-64=${GCC_VERSION} \
gxx_linux-64=${GCC_VERSION} \
openmpi-mpifort \
gfortran_linux-64=${GCC_VERSION} && \
rm -rf /opt/conda/pkgs/* && \
ln -sf /opt/conda/bin/x86_64-conda-linux-gnu-ar /opt/conda/bin/ar && \
ln -sf /opt/conda/bin/x86_64-conda-linux-gnu-ranlib /opt/conda/bin/ranlib

# Install cpan packages
RUN cpan install XML::LibXML Switch

# Build pnetcdf
RUN curl -L -k -o "${PWD}/pnetcdf.tar.gz" \
https://parallel-netcdf.github.io/Release/pnetcdf-${PNETCDF_VERSION}.tar.gz && \
mkdir "${PWD}/pnetcdf" && \
tar -xvf "${PWD}/pnetcdf.tar.gz" -C "${PWD}/pnetcdf" --strip-components=1 && \
rm -rf "${PWD}/pnetcdf.tar.gz" && \
cd "${PWD}/pnetcdf" && \
source /opt/conda/etc/profile.d/conda.sh && \
conda activate base && \
./configure --prefix /opt/conda --disable-cxx --enable-shared \
MPICC=/opt/conda/bin/mpicc \
MPICXX=/opt/conda/bin/mpicxx \
MPIF77=/opt/conda/bin/mpif77 \
MPIF90=/opt/conda/bin/mpif90 && \
make -j4 && \
make install && \
rm -rf "${PWD}/pnetcdf"

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive \
apt-get install -y --no-install-recommends \
curl ca-certificates software-properties-common \
gcc make libtool libhwloc-dev libx11-dev libxt-dev libedit-dev \
libical-dev ncurses-dev perl python-dev tcl-dev tk-dev swig libexpat-dev libssl-dev \
libxext-dev libxft-dev autoconf automake \
postgresql-12 postgresql-server-dev-all postgresql-contrib \
expat libedit2 python3 sendmail-bin sudo tcl tk && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && \
DEBIAN_FRONTEND=noninteractive \
apt-get install -y python3.7 python3.7-dev && \
rm -rf /var/lib/apt/lists/*

# Install additional packages
RUN mamba install --yes -c conda-forge \
lapack \
blas && \
rm -rf /opt/conda/pkgs/*

# Install dependencies
COPY docker/prognostic_scream_run/scream/components/eamxx/docker/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt

# install gcloud
RUN apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl gettext

RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list &&\
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

RUN apt-get update && apt-get install -y google-cloud-sdk
RUN gcloud config set project vcm-ml

ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
ENV IS_DOCKER=TRUE
COPY docker/prognostic_scream_run/scream/ /src/E3SM
COPY external/vcm /src/vcm
RUN pip install /src/vcm
COPY workflows/prognostic_scream_run/ /src/prognostic_scream_run
RUN pip install -r /src/prognostic_scream_run/requirements.txt
RUN pip install -e /src/prognostic_scream_run
COPY docker/prognostic_scream_run/precompile_scream.sh /src/precompile_scream.sh
ENV CC=/opt/conda/bin/mpicc
ENV CXX=/opt/conda/bin/mpicxx
ENV FC=/opt/conda/bin/mpif90
ENV AR=/opt/conda/bin/x86_64-conda-linux-gnu-ar
ENV RANLIB=/opt/conda/bin/x86_64-conda-linux-gnu-ranlib
ENV LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/opt/conda/lib -Wl,-rpath-link,/opt/conda/lib -L/opt/conda/lib"
ENV GOOGLE_APPLICATION_CREDENTIALS /tmp/key.json
RUN --mount=type=secret,id=gcp,dst=/tmp/key.json \
export FSSPEC_GS_TOKEN=/tmp/key.json && \
/src/precompile_scream.sh
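
The final build step mounts a GCP service-account key as a BuildKit secret and points FSSPEC_GS_TOKEN at it before precompiling SCREAM. A sketch of how such a key is plausibly consumed on the Python side, assuming fsspec with gcsfs installed and a readable bucket (the bucket name is illustrative):

import fsspec

# gcsfs accepts a path to a service-account JSON via token=; the mounted
# key file then authenticates gs:// access during the build.
fs = fsspec.filesystem("gs", token="/tmp/key.json")
print(fs.ls("vcm-ml"))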
25 changes: 25 additions & 0 deletions docker/prognostic_scream_run/precompile_scream.sh
@@ -0,0 +1,25 @@
function fixup_mct {
    local mct_path="${1}"

    # TODO make PR to fix
    if [[ ! -e "${mct_path}/mct/Makefile.bak" ]]
    then
        sed -i".bak" "s/\$(AR)/\$(AR) \$(ARFLAGS)/g" "${mct_path}/mct/Makefile"
    fi

    if [[ ! -e "${mct_path}/mpeu/Makefile.bak" ]]
    then
        sed -i".bak" "s/\$(AR)/\$(AR) \$(ARFLAGS)/g" "${mct_path}/mpeu/Makefile"
    fi
}

# Fixes mct/mpeu to use ARFLAGS environment variable
# CIME will eventually have this fixed, remove this function when it does
fixup_mct "/src/E3SM/externals/mct"
for number_of_processors in 16 180; do
    cd /tmp
    cp /src/prognostic_scream_run/tests/example_configs/scream_ne30pg2.yaml ${number_of_processors}.yaml
    sed -i -e "s/number_of_processors: 16/number_of_processors: $(printf "%d" $number_of_processors)/g" ${number_of_processors}.yaml
    mkdir -p rundir
    scream_run write-rundir ${number_of_processors}.yaml rundir
done
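
The loop above stamps out one config per processor count with sed. A Python equivalent of that rewrite, assuming PyYAML and a top-level number_of_processors key in the example config:

import yaml

# Load the example config once, then write one copy per processor count.
with open("scream_ne30pg2.yaml") as f:
    config = yaml.safe_load(f)
for n in (16, 180):
    config["number_of_processors"] = n
    with open(f"{n}.yaml", "w") as f:
        yaml.safe_dump(config, f)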
9 changes: 9 additions & 0 deletions external/emulation/emulation/masks.py
@@ -56,6 +56,14 @@ def __call__(self, state: FortranState, emulator: FortranState) -> FortranState:
        use_fortran_state = slice(self.start, self.stop)
        # Fortran state TOA is index 79, and dims are [z, sample]
        emulator_field = np.copy(emulator[self.key])

        # Currently, Fortran fields pushed into the Python state are 64-bit
        # floats while the emulator output is float32. Since there are no
        # post-hoc adjustments for precpd, this leads to noise in the
        # tendencies estimated from the masked levels due to the 32 -> 64
        # cast; casting up front to float64 resolves it.
        if emulator_field.dtype != np.float64:
            emulator_field = emulator_field.astype(np.float64)

        if self.fill_value is None:
            emulator_field[use_fortran_state] = state[self.key][use_fortran_state]
        elif isinstance(self.fill_value, str):
@@ -64,4 +72,5 @@ def __call__(self, state: FortranState, emulator: FortranState) -> FortranState:
            ]
        elif isinstance(self.fill_value, float):
            emulator_field[use_fortran_state] = self.fill_value

        return {**emulator, self.key: emulator_field}
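
The cast added above matters because masked levels are filled from the float64 Fortran state and later differenced against it. A minimal numpy sketch of the effect, with invented values:

import numpy as np

# Writing float64 state values into a float32 emulator field truncates them,
# so the "copied" levels pick up ~1e-5 noise when differenced back.
state = np.array([200.1, 250.3, 300.7])     # float64 Fortran state
emulator32 = np.zeros(3, dtype=np.float32)
emulator32[:] = state                       # masked levels filled from state
print(emulator32 - state)                   # nonzero round-trip noise

emulator64 = emulator32.astype(np.float64)  # the fix: cast before filling
emulator64[:] = state
print(emulator64 - state)                   # exactly zeros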
1 change: 1 addition & 0 deletions external/emulation/emulation/zhao_carr.py
@@ -234,6 +234,7 @@ def enforce_conservative_gscond(state, emulator):
def enforce_conservative_phase_dependent(state, emulator):
    cloud_out = emulator[GscondOutput.cloud_water]
    net_condensation = cloud_out - state[Input.cloud_water]
    net_condensation = _limit_net_condensation_conserving(state, net_condensation)
    return {**emulator, **apply_condensation_phase_dependent(state, net_condensation)}
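
The added line routes the emulated net condensation through a conservation limiter before tendencies are applied. A hedged sketch of what a limiter like _limit_net_condensation_conserving might do (an assumption about its behavior, not the repo's implementation): condensation cannot consume more vapor than exists, and evaporation cannot consume more cloud water than exists.

import numpy as np

# Hypothetical limiter: positive net condensation consumes vapor, negative
# consumes cloud water, so clip to the available amount of each.
def limit_net_condensation(vapor, cloud_water, net_condensation):
    return np.clip(net_condensation, -cloud_water, vapor)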


29 changes: 26 additions & 3 deletions external/fv3fit/docs/composite-models.rst
@@ -43,7 +43,9 @@ Models augmented with out-of-sample detection can be defined with a config file
Tapered models
--------------------
A tapering transform can be applied to an existing saved model:

.. code-block:: yaml

    model: gs://vcm-ml-experiments/some_path
    tapering:
      dQ1:
@@ -60,7 +62,28 @@ Combined models
Combines multiple models with nonoverlapping output variables into a single model.
Similar functionality is also in the prognostic run's MultipleModelAdapter, but sometimes it
is more efficient to combine models earlier in the workflow.

.. code-block:: yaml

    models:
      - gs://vcm-ml-experiments/model1
      - gs://vcm-ml-experiments/model2

Squashed output models
----------------------
"Squashes" the output of a model, which means that samples less than a threshold value for a
particular output variable will be set to a target. Configured by a list of squashing rules,
which specify the name of the variable to determine the threshold, the threshold and the target,
and additional variables that should also be squared at the same positions.

.. code-block:: yaml

    base_model_path: gs://vcm-ml-experiments/model1
    squashing:
      - squash_by_name: cloud_amount
        squash_threshold: 0.08
        squash_to: 0.0
        additional_squash_target_names:
          - cloud_water_mixing_ratio
          - cloud_ice_mixing_ratio
2 changes: 2 additions & 0 deletions external/fv3fit/fv3fit/__init__.py
@@ -50,5 +50,7 @@
# need to import this to register the training func
import fv3fit.train_microphysics
import fv3fit.dataclasses
import fv3fit.reservoir.train
import fv3fit.reservoir.transformers.autoencoder

__version__ = "0.1.0"
3 changes: 2 additions & 1 deletion external/fv3fit/fv3fit/_shared/__init__.py
@@ -1,4 +1,4 @@
-from .config import SliceConfig, PackerConfig
+from .config import SliceConfig, PackerConfig, OptimizerConfig
from .training_config import TrainingConfig, register_training_function
from .packer import (
pack,
@@ -21,3 +21,4 @@
)
from .models import EnsembleModel, DerivedModel, TransformedPredictor
from .filesystem import get_dir, put_dir
from .xr_prediction import DatasetPredictor
37 changes: 31 additions & 6 deletions external/fv3fit/fv3fit/_shared/config.py
@@ -1,10 +1,5 @@
import dataclasses
from typing import (
    Any,
    Hashable,
    Mapping,
    Optional,
)
from typing import Any, Hashable, Mapping, Optional, Sequence

# TODO: move all keras configs under fv3fit.keras
import tensorflow as tf
@@ -115,3 +110,33 @@ class PackerConfig:
"""

clip: Mapping[Hashable, SliceConfig] = dataclasses.field(default_factory=dict)


@dataclasses.dataclass
class SquashedOutputConfig:
    """
    Configuration of output squashing

    Attributes:
        squash_by_name: name of the variable that will determine whether
            outputs are squashed
        additional_squash_target_names: names of the variables to be squashed
            in addition to `squash_by_name`
        squash_threshold: threshold value of squash_by_name below which
            squashing will occur at that sample and feature position for all
            target variables
        squash_to: value to which squashed values will be set
    """

    squash_by_name: Hashable
    squash_threshold: float
    squash_to: float = 0.0
    additional_squash_target_names: Sequence[Hashable] = ()

    def squash(self, predictions: xr.Dataset) -> xr.Dataset:
        squashed_predictions = predictions.copy()
        for name in [self.squash_by_name] + list(self.additional_squash_target_names):
            squashed_predictions[name] = predictions[name].where(
                predictions[self.squash_by_name] > self.squash_threshold,
                self.squash_to,
            )
        return squashed_predictions
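
A usage sketch for the new config; the dataset and values are invented for illustration, mirroring the YAML example in composite-models.rst, and assume numpy and xarray are available:

import numpy as np
import xarray as xr

config = SquashedOutputConfig(
    squash_by_name="cloud_amount",
    squash_threshold=0.08,
    additional_squash_target_names=["cloud_water_mixing_ratio"],
)
predictions = xr.Dataset(
    {
        "cloud_amount": ("sample", np.array([0.01, 0.5])),
        "cloud_water_mixing_ratio": ("sample", np.array([1e-4, 2e-3])),
    }
)
squashed = config.squash(predictions)
# cloud_amount <= 0.08 in the first sample, so both variables there become 0.0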
File renamed without changes.
