Skip to content

Commit

Permalink
Save basic stats in csv at each cycle (#1040)
Browse files Browse the repository at this point in the history
### Work done
Computes basic stats (count, bias and rmse of omb's and oma's) for
different ocean basins, this assumes the obs were flagged with the ocean
mask from `RECCAP2_region_masks_all_v20221025.nc`.
- The stats are computed within the post ocean analysis jjob. Failure of
this job will only result in a warning.
- I provided a simple plotting script that will accept multiple
experiments as arguments and plot time series. Not sure if we want this
in the vrfy job

### Work left to do
We're not flagging the obs currently, so that code will do nothing until
we update the prep ocean obs yaml to point to
`RECCAP2_region_masks_all_v20221025.nc` (it's been copied to the soca
fix dir on hera and orion). To have the obs flagged in develop will
require the ioda converters to not fail if the
`RECCAP2_region_masks_all_v20221025` isn't found.
  • Loading branch information
guillaumevernieres authored Apr 15, 2024
1 parent 6f7bd18 commit 420459c
Show file tree
Hide file tree
Showing 6 changed files with 406 additions and 3 deletions.
7 changes: 7 additions & 0 deletions parm/soca/obs/obs_stats.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Configuration template (jinja2) for gdassoca_obsstats.x.
# The time window is deliberately wide open so every observation present
# in the diag files contributes to the statistics.
time window:
  begin: '1900-01-01T00:00:00Z'
  end: '2035-01-01T00:00:00Z'
  bound to include: begin

# Filled at run time with the list of obs-space entries built by
# exgdas_global_marine_analysis_post.py (create_obs_space).
obs spaces:
{{ obs_spaces }}
77 changes: 74 additions & 3 deletions scripts/exgdas_global_marine_analysis_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@
import glob
import shutil
from datetime import datetime, timedelta
from wxflow import FileHandler, Logger
from wxflow import AttrDict, FileHandler, Logger, parse_j2yaml
from multiprocessing import Process
import subprocess
import netCDF4
import re

# module-level wxflow logger shared by the rest of this script
logger = Logger()

Expand All @@ -36,8 +40,6 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
return fh_list


logger.info(f"---------------- Copy from RUNDIR to COMOUT")

# staging/destination directories provided by the j-job environment
com_ocean_analysis = os.getenv('COM_OCEAN_ANALYSIS')  # COMOUT for the ocean analysis
com_ice_restart = os.getenv('COM_ICE_RESTART')        # COMOUT for ice restarts
anl_dir = os.getenv('DATA')                           # run directory of the analysis job
Expand All @@ -52,6 +54,8 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
# ISO-8601 strings for the window-begin and middle-of-window (analysis) times
# NOTE(review): bdatedt and cdate are defined earlier in this script (not
# visible in this hunk) — presumably derived from PDY/cyc; confirm
bdate = datetime.strftime(bdatedt, '%Y-%m-%dT%H:00:00Z')
mdate = datetime.strftime(datetime.strptime(cdate, '%Y%m%d%H'), '%Y-%m-%dT%H:00:00Z')

# stage the analysis output from the run directory to COMOUT
logger.info(f"---------------- Copy from RUNDIR to COMOUT")

# (src, dst) pairs accumulated below and copied in one FileHandler call
post_file_list = []

# Make a copy of the IAU increment
Expand Down Expand Up @@ -106,3 +110,70 @@ def list_all_files(dir_in, dir_out, wc='*', fh_list=[]):
os.path.join(com_ocean_analysis, 'yaml'), wc='*.yaml', fh_list=fh_list)

# perform all of the accumulated copies in one shot
FileHandler({'copy': fh_list}).sync()

# obs space statistics: locate the diag files and the yaml template used
# to drive gdassoca_obsstats.x
logger.info("---------------- Compute basic stats")  # no interpolation needed, plain string
# was os.path.join(os.path.join(...)): the inner join was redundant
diags_list = glob.glob(os.path.join(com_ocean_analysis, 'diags', '*.nc4'))
obsstats_j2yaml = str(os.path.join(os.getenv('HOMEgfs'), 'sorc', 'gdas.cd',
                                   'parm', 'soca', 'obs', 'obs_stats.yaml.j2'))

# helper: build a minimalist ioda obs-space configuration entry
def create_obs_space(data):
    """Return one obs-space dictionary consumed by obs_stats.yaml.j2.

    data must provide the keys 'obs_space', 'obsfile', 'variable',
    'pslot' and 'csv_output'.
    """
    engine = {"type": "H5File", "obsfile": data["obsfile"]}
    obs_space = {
        "name": data["obs_space"],
        "obsdatain": {"engine": engine},
        "simulated variables": [data["variable"]],
    }
    return {
        "obs space": obs_space,
        "variable": data["variable"],
        "experiment identifier": data["pslot"],
        "csv output": data["csv_output"],
    }


# attempt to extract the experiment id from the path
# NOTE(review): assumes COM_OCEAN_ANALYSIS is 5 levels below the pslot
# directory (e.g. <root>/<pslot>/<run>.<date>/<hh>/analysis/ocean); a
# different COM layout silently yields a wrong experiment id — confirm
pslot = os.path.normpath(com_ocean_analysis).split(os.sep)[-5]

# iterate through the obs spaces and generate the yaml for gdassoca_obsstats.x
obs_spaces = []
for obsfile in diags_list:

    # define an obs space name: basename with the trailing
    # .<10-digit cycle date>.nc4 suffix stripped
    obs_space = re.sub(r'\.\d{10}\.nc4$', '', os.path.basename(obsfile))

    # get the variable name, assume 1 variable per file
    nc = netCDF4.Dataset(obsfile, 'r')
    variable = next(iter(nc.groups["ObsValue"].variables))
    nc.close()

    # filling values for the templated yaml
    # NOTE(review): RUN and cyc are presumably defined earlier in this
    # script from the environment (not visible in this hunk) — confirm
    data = {'obs_space': os.path.basename(obsfile),
            'obsfile': obsfile,
            'pslot': pslot,
            'variable': variable,
            'csv_output': os.path.join(com_ocean_analysis,
                                       f"{RUN}.t{cyc}z.ocn.{obs_space}.stats.csv")}
    obs_spaces.append(create_obs_space(data))

# create the yaml driving gdassoca_obsstats.x from the template
data = {'obs_spaces': obs_spaces}
conf = parse_j2yaml(path=obsstats_j2yaml, data=data)
stats_yaml = 'diag_stats.yaml'
conf.save(stats_yaml)

# run the application
# TODO(GorA): this should be setup properly in the g-w once gdassoca_obsstats is in develop
gdassoca_obsstats_exec = os.path.join(os.getenv('HOMEgfs'),
                                      'sorc', 'gdas.cd', 'build', 'bin', 'gdassoca_obsstats.x')
command = f"{os.getenv('launcher')} {gdassoca_obsstats_exec} {stats_yaml}"
logger.info(f"{command}")
result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# issue a warning if the process has failed; this step is best-effort and
# must not fail the post job
if result.returncode != 0:
    logger.warning(f"{command} has failed")
if result.stderr:
    print("STDERR:", result.stderr.decode())
97 changes: 97 additions & 0 deletions ush/soca/gdassoca_obsstats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3


# creates figures of timeseries from the csv outputs computed by gdassoca_obsstats.x
import argparse
from itertools import product
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


class ObsStats:
    """Accumulate and plot observation-space statistics time series.

    Reads the csv files written by gdassoca_obsstats.x (expected columns:
    date (YYYYMMDDHH), Ocean, Variable, Exp, RMSE, Bias, Count) and plots
    RMSE/Bias/Count time series for one ocean basin and variable, with one
    line per experiment.
    """

    def __init__(self):
        # statistics accumulated from every csv read so far
        self.data = pd.DataFrame()

    def read_csv(self, filepaths):
        """Append the content of each csv file in *filepaths* to self.data.

        The 'date' column is parsed from YYYYMMDDHH into datetime so the
        plots can use date-aware axes.
        """
        for filepath in filepaths:
            new_data = pd.read_csv(filepath)
            # Convert date to datetime for easier plotting
            new_data['date'] = pd.to_datetime(new_data['date'], format='%Y%m%d%H')
            self.data = pd.concat([self.data, new_data], ignore_index=True)

    @staticmethod
    def _plot_stat(ax, exp_data, column, exp, hide_xlabels):
        """Draw one statistic (*column*) for one experiment on *ax*.

        Factored out of plot_timeseries, which previously repeated this
        block three times (RMSE/Bias/Count).
        """
        ax.plot(exp_data['date'], exp_data[column], marker='o', linestyle='-', label=exp)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H'))
        ax.xaxis.set_major_locator(mdates.DayLocator())
        if hide_xlabels:
            # only the bottom panel shows the shared time axis labels
            ax.tick_params(labelbottom=False)
        ax.set_ylabel(column)
        ax.legend()
        ax.grid(True)

    def plot_timeseries(self, ocean, variable, figname=None):
        """Plot RMSE, Bias and Count time series for *ocean*/*variable*.

        Parameters
        ----------
        ocean : str
            Basin name matched against the 'Ocean' column (e.g. 'Atlantic').
        variable : str
            Name matched against the 'Variable' column (e.g. 'ombg_qc').
        figname : str, optional
            Output file for the figure; defaults to '<ocean>_test.png',
            preserving the historical behavior.
        """
        # Filter data for the given ocean and variable
        filtered_data = self.data[(self.data['Ocean'] == ocean) & (self.data['Variable'] == variable)]

        if filtered_data.empty:
            print("No data available for the given ocean and variable combination.")
            return

        # Plot settings: one panel per statistic, shared time axis
        fig, axs = plt.subplots(3, 1, figsize=(10, 15), sharex=True)
        fig.suptitle(f'{variable} statistics, {ocean} ocean', fontsize=16, fontweight='bold')

        # one line per experiment in each panel
        for exp in filtered_data['Exp'].unique():
            exp_data = filtered_data[filtered_data['Exp'] == exp]
            self._plot_stat(axs[0], exp_data, 'RMSE', exp, hide_xlabels=True)
            self._plot_stat(axs[1], exp_data, 'Bias', exp, hide_xlabels=True)
            self._plot_stat(axs[2], exp_data, 'Count', exp, hide_xlabels=False)

        # Improve layout and save the figure
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(figname if figname is not None else f'{ocean}_test.png')


if __name__ == "__main__":
    # fixed: epilog referred to a non-existent --exp flag and showed a csv
    # glob, while --exps actually expects experiment COMROOT paths
    epilog = ["Usage examples: ./gdassoca_obsstats.py --exps /path/to/comroot "
              "--variable ombg_qc --figname stats"]
    parser = argparse.ArgumentParser(description="Observation space RMSE's and BIAS's",
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=os.linesep.join(epilog))
    parser.add_argument("--exps", nargs='+', required=True,
                        help="Path to the experiment's COMROOT")
    parser.add_argument("--variable", required=True, help="ombg_qc or ombg_noqc")
    parser.add_argument("--figname", required=True, help="The name of the output figure")
    args = parser.parse_args()

    # collect every per-cycle stats csv under each experiment's COMROOT
    flist = []
    for exp in args.exps:
        print(exp)
        wc = exp + '/*.*/??/analysis/ocean/*.t??z.stats.csv'
        # was flist.append(...) followed by flist = sum(flist, []):
        # extend flattens as we go and avoids the quadratic re-concatenation
        flist.extend(glob.glob(wc))

    obsStats = ObsStats()
    obsStats.read_csv(flist)

    # NOTE(review): args.variable and args.figname are parsed but never
    # used — every variable/basin combination is plotted and the output
    # name is chosen inside plot_timeseries. TODO: wire them through.
    for var, ocean in product(['ombg_noqc', 'ombg_qc'],
                              ['Atlantic', 'Pacific', 'Indian', 'Arctic', 'Southern']):
        obsStats.plot_timeseries(ocean, var)
6 changes: 6 additions & 0 deletions utils/soca/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ ecbuild_add_executable( TARGET gdas_socahybridweights.x
SOURCES gdas_socahybridweights.cc )
target_compile_features( gdas_socahybridweights.x PUBLIC cxx_std_17)
target_link_libraries( gdas_socahybridweights.x PUBLIC NetCDF::NetCDF_CXX oops atlas soca)

# Obs Stats
# gdassoca_obsstats.x computes basic obs-space statistics (count, bias,
# rmse of omb/oma) per ocean basin and writes them to csv; links against
# ioda to read the diag files.
ecbuild_add_executable( TARGET gdassoca_obsstats.x
                        SOURCES gdassoca_obsstats.cc )
target_compile_features( gdassoca_obsstats.x PUBLIC cxx_std_17)
target_link_libraries( gdassoca_obsstats.x PUBLIC NetCDF::NetCDF_CXX oops ioda)
10 changes: 10 additions & 0 deletions utils/soca/gdassoca_obsstats.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "gdassoca_obsstats.h"
#include "oops/runs/Run.h"

// this application computes a few basic statistics in obs space
// Entry point: hands control to the oops Application framework, which
// parses the command line (yaml configuration path as first argument)
// and calls the ObsStats application defined in gdassoca_obsstats.h.

int main(int argc, char ** argv) {
oops::Run run(argc, argv);  // sets up MPI/logging and reads the yaml
gdasapp::ObsStats obsStats;
return run.execute(obsStats);  // returns the application's exit status
}
Loading

0 comments on commit 420459c

Please sign in to comment.