Skip to content

Commit

Permalink
Save basic stats in csv at each cycle (#1040)
Browse files Browse the repository at this point in the history
### Work done
Computes basic stats (count, bias and rmse of omb's and oma's) for
different ocean basins, this assumes the obs were flagged with the ocean
mask from `RECCAP2_region_masks_all_v20221025.nc`.
- The stats are computed within the post ocean analysis jjob. Failure of
this job will only result in a warning.
- I provided a simple plotting script that will accept multiple
experiments as arguments and plot time series. Not sure if we want this
in the vrfy job

### Work left to do
We're not flagging the obs currently, so that code will do nothing until
we update the prep ocean obs yaml to point to
`RECCAP2_region_masks_all_v20221025.nc` (it's been copied to the soca
fix dir on hera and orion). To have the obs flagged in develop will
require the ioda converters to not fail if the
`RECCAP2_region_masks_all_v20221025` isn't found.
  • Loading branch information
guillaumevernieres authored Apr 15, 2024
1 parent 6f7bd18 commit 420459c
Show file tree
Hide file tree
Showing 6 changed files with 406 additions and 3 deletions.
7 changes: 7 additions & 0 deletions parm/soca/obs/obs_stats.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Configuration template (jinja2) for gdassoca_obsstats.x.
# The time window is deliberately wide open so every observation present
# in the diag files contributes to the statistics.
time window:
  begin: '1900-01-01T00:00:00Z'
  end: '2035-01-01T00:00:00Z'
  bound to include: begin

# Filled at run time with the list of obs-space entries built by
# exgdas_global_marine_analysis_post.py (create_obs_space).
obs spaces:
{{ obs_spaces }}
77 changes: 74 additions & 3 deletions scripts/exgdas_global_marine_analysis_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
import glob
import shutil
from datetime import datetime, timedelta
from wxflow import FileHandler, Logger
from wxflow import AttrDict, FileHandler, Logger, parse_j2yaml
from multiprocessing import Process
import subprocess
import netCDF4
import re

# module-level wxflow logger shared by the rest of this script
logger = Logger()

Expand All @@ -36,8 +40,6 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
return fh_list


logger.info(f"---------------- Copy from RUNDIR to COMOUT")

# staging/destination directories provided by the j-job environment
com_ocean_analysis = os.getenv('COM_OCEAN_ANALYSIS')  # COMOUT for the ocean analysis
com_ice_restart = os.getenv('COM_ICE_RESTART')        # COMOUT for ice restarts
anl_dir = os.getenv('DATA')                           # run directory of the analysis job
Expand All @@ -52,6 +54,8 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
# ISO-8601 strings for the window-begin and middle-of-window (analysis) times
# NOTE(review): bdatedt and cdate are defined earlier in this script (not
# visible in this hunk) — presumably derived from PDY/cyc; confirm
bdate = datetime.strftime(bdatedt, '%Y-%m-%dT%H:00:00Z')
mdate = datetime.strftime(datetime.strptime(cdate, '%Y%m%d%H'), '%Y-%m-%dT%H:00:00Z')

# stage the analysis output from the run directory to COMOUT
logger.info(f"---------------- Copy from RUNDIR to COMOUT")

# (src, dst) pairs accumulated below and copied in one FileHandler call
post_file_list = []

# Make a copy of the IAU increment
Expand Down Expand Up @@ -106,3 +110,70 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
os.path.join(com_ocean_analysis, 'yaml'), wc='*.yaml', fh_list=fh_list)

# perform all of the accumulated copies in one shot
FileHandler({'copy': fh_list}).sync()

# obs space statistics: locate the diag files and the yaml template used
# to drive gdassoca_obsstats.x
logger.info("---------------- Compute basic stats")  # no interpolation needed, plain string
# was os.path.join(os.path.join(...)): the inner join was redundant
diags_list = glob.glob(os.path.join(com_ocean_analysis, 'diags', '*.nc4'))
obsstats_j2yaml = str(os.path.join(os.getenv('HOMEgfs'), 'sorc', 'gdas.cd',
                                   'parm', 'soca', 'obs', 'obs_stats.yaml.j2'))

# helper: build a minimalist ioda obs-space configuration entry
def create_obs_space(data):
    """Return one obs-space dictionary consumed by obs_stats.yaml.j2.

    data must provide the keys 'obs_space', 'obsfile', 'variable',
    'pslot' and 'csv_output'.
    """
    engine = {"type": "H5File", "obsfile": data["obsfile"]}
    obs_space = {
        "name": data["obs_space"],
        "obsdatain": {"engine": engine},
        "simulated variables": [data["variable"]],
    }
    return {
        "obs space": obs_space,
        "variable": data["variable"],
        "experiment identifier": data["pslot"],
        "csv output": data["csv_output"],
    }


# attempt to extract the experiment id from the path
# NOTE(review): assumes COM_OCEAN_ANALYSIS is 5 levels below the pslot
# directory (e.g. <root>/<pslot>/<run>.<date>/<hh>/analysis/ocean); a
# different COM layout silently yields a wrong experiment id — confirm
pslot = os.path.normpath(com_ocean_analysis).split(os.sep)[-5]

# iterate through the obs spaces and generate the yaml for gdassoca_obsstats.x
obs_spaces = []
for obsfile in diags_list:

    # define an obs space name: basename with the trailing
    # .<10-digit cycle date>.nc4 suffix stripped
    obs_space = re.sub(r'\.\d{10}\.nc4$', '', os.path.basename(obsfile))

    # get the variable name, assume 1 variable per file
    nc = netCDF4.Dataset(obsfile, 'r')
    variable = next(iter(nc.groups["ObsValue"].variables))
    nc.close()

    # filling values for the templated yaml
    # NOTE(review): RUN and cyc are presumably defined earlier in this
    # script from the environment (not visible in this hunk) — confirm
    data = {'obs_space': os.path.basename(obsfile),
            'obsfile': obsfile,
            'pslot': pslot,
            'variable': variable,
            'csv_output': os.path.join(com_ocean_analysis,
                                       f"{RUN}.t{cyc}z.ocn.{obs_space}.stats.csv")}
    obs_spaces.append(create_obs_space(data))

# create the yaml driving gdassoca_obsstats.x from the template
data = {'obs_spaces': obs_spaces}
conf = parse_j2yaml(path=obsstats_j2yaml, data=data)
stats_yaml = 'diag_stats.yaml'
conf.save(stats_yaml)

# run the application
# TODO(GorA): this should be setup properly in the g-w once gdassoca_obsstats is in develop
gdassoca_obsstats_exec = os.path.join(os.getenv('HOMEgfs'),
                                      'sorc', 'gdas.cd', 'build', 'bin', 'gdassoca_obsstats.x')
command = f"{os.getenv('launcher')} {gdassoca_obsstats_exec} {stats_yaml}"
logger.info(f"{command}")
result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# issue a warning if the process has failed; this step is best-effort and
# must not fail the post job
if result.returncode != 0:
    logger.warning(f"{command} has failed")
if result.stderr:
    print("STDERR:", result.stderr.decode())
97 changes: 97 additions & 0 deletions ush/soca/gdassoca_obsstats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3


# creates figures of timeseries from the csv outputs computed by gdassoca_obsstats.x
import argparse
from itertools import product
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


class ObsStats:
    """Accumulate and plot observation-space statistics time series.

    Reads the csv files written by gdassoca_obsstats.x (expected columns:
    date (YYYYMMDDHH), Ocean, Variable, Exp, RMSE, Bias, Count) and plots
    RMSE/Bias/Count time series for one ocean basin and variable, with one
    line per experiment.
    """

    def __init__(self):
        # statistics accumulated from every csv read so far
        self.data = pd.DataFrame()

    def read_csv(self, filepaths):
        """Append the content of each csv file in *filepaths* to self.data.

        The 'date' column is parsed from YYYYMMDDHH into datetime so the
        plots can use date-aware axes.
        """
        for filepath in filepaths:
            new_data = pd.read_csv(filepath)
            # Convert date to datetime for easier plotting
            new_data['date'] = pd.to_datetime(new_data['date'], format='%Y%m%d%H')
            self.data = pd.concat([self.data, new_data], ignore_index=True)

    @staticmethod
    def _plot_stat(ax, exp_data, column, exp, hide_xlabels):
        """Draw one statistic (*column*) for one experiment on *ax*.

        Factored out of plot_timeseries, which previously repeated this
        block three times (RMSE/Bias/Count).
        """
        ax.plot(exp_data['date'], exp_data[column], marker='o', linestyle='-', label=exp)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H'))
        ax.xaxis.set_major_locator(mdates.DayLocator())
        if hide_xlabels:
            # only the bottom panel shows the shared time axis labels
            ax.tick_params(labelbottom=False)
        ax.set_ylabel(column)
        ax.legend()
        ax.grid(True)

    def plot_timeseries(self, ocean, variable, figname=None):
        """Plot RMSE, Bias and Count time series for *ocean*/*variable*.

        Parameters
        ----------
        ocean : str
            Basin name matched against the 'Ocean' column (e.g. 'Atlantic').
        variable : str
            Name matched against the 'Variable' column (e.g. 'ombg_qc').
        figname : str, optional
            Output file for the figure; defaults to '<ocean>_test.png',
            preserving the historical behavior.
        """
        # Filter data for the given ocean and variable
        filtered_data = self.data[(self.data['Ocean'] == ocean) & (self.data['Variable'] == variable)]

        if filtered_data.empty:
            print("No data available for the given ocean and variable combination.")
            return

        # Plot settings: one panel per statistic, shared time axis
        fig, axs = plt.subplots(3, 1, figsize=(10, 15), sharex=True)
        fig.suptitle(f'{variable} statistics, {ocean} ocean', fontsize=16, fontweight='bold')

        # one line per experiment in each panel
        for exp in filtered_data['Exp'].unique():
            exp_data = filtered_data[filtered_data['Exp'] == exp]
            self._plot_stat(axs[0], exp_data, 'RMSE', exp, hide_xlabels=True)
            self._plot_stat(axs[1], exp_data, 'Bias', exp, hide_xlabels=True)
            self._plot_stat(axs[2], exp_data, 'Count', exp, hide_xlabels=False)

        # Improve layout and save the figure
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(figname if figname is not None else f'{ocean}_test.png')


if __name__ == "__main__":
    # fixed: epilog referred to a non-existent --exp flag and showed a csv
    # glob, while --exps actually expects experiment COMROOT paths
    epilog = ["Usage examples: ./gdassoca_obsstats.py --exps /path/to/comroot "
              "--variable ombg_qc --figname stats"]
    parser = argparse.ArgumentParser(description="Observation space RMSE's and BIAS's",
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=os.linesep.join(epilog))
    parser.add_argument("--exps", nargs='+', required=True,
                        help="Path to the experiment's COMROOT")
    parser.add_argument("--variable", required=True, help="ombg_qc or ombg_noqc")
    parser.add_argument("--figname", required=True, help="The name of the output figure")
    args = parser.parse_args()

    # collect every per-cycle stats csv under each experiment's COMROOT
    flist = []
    for exp in args.exps:
        print(exp)
        wc = exp + '/*.*/??/analysis/ocean/*.t??z.stats.csv'
        # was flist.append(...) followed by flist = sum(flist, []):
        # extend flattens as we go and avoids the quadratic re-concatenation
        flist.extend(glob.glob(wc))

    obsStats = ObsStats()
    obsStats.read_csv(flist)

    # NOTE(review): args.variable and args.figname are parsed but never
    # used — every variable/basin combination is plotted and the output
    # name is chosen inside plot_timeseries. TODO: wire them through.
    for var, ocean in product(['ombg_noqc', 'ombg_qc'],
                              ['Atlantic', 'Pacific', 'Indian', 'Arctic', 'Southern']):
        obsStats.plot_timeseries(ocean, var)
6 changes: 6 additions & 0 deletions utils/soca/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ ecbuild_add_executable( TARGET gdas_socahybridweights.x
SOURCES gdas_socahybridweights.cc )
target_compile_features( gdas_socahybridweights.x PUBLIC cxx_std_17)
target_link_libraries( gdas_socahybridweights.x PUBLIC NetCDF::NetCDF_CXX oops atlas soca)

# Obs Stats
# gdassoca_obsstats.x computes basic obs-space statistics (count, bias,
# rmse of omb/oma) per ocean basin and writes them to csv; links against
# ioda to read the diag files.
ecbuild_add_executable( TARGET gdassoca_obsstats.x
                        SOURCES gdassoca_obsstats.cc )
target_compile_features( gdassoca_obsstats.x PUBLIC cxx_std_17)
target_link_libraries( gdassoca_obsstats.x PUBLIC NetCDF::NetCDF_CXX oops ioda)
10 changes: 10 additions & 0 deletions utils/soca/gdassoca_obsstats.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "gdassoca_obsstats.h"
#include "oops/runs/Run.h"

// this application computes a few basic statistics in obs space
// Entry point: hands control to the oops Application framework, which
// parses the command line (yaml configuration path as first argument)
// and calls the ObsStats application defined in gdassoca_obsstats.h.

int main(int argc, char ** argv) {
oops::Run run(argc, argv);  // sets up MPI/logging and reads the yaml
gdasapp::ObsStats obsStats;
return run.execute(obsStats);  // returns the application's exit status
}
Loading

0 comments on commit 420459c

Please sign in to comment.