work with GSI diag files

Contents

work with GSI diag files#

This notebook illustreate how to work with GSI diag files using a similar obsSpaceGSI class

%%time 

# autoload external python modules if they changed
%load_ext autoreload
%autoreload 2

import os, sys
pyDAmonitor_ROOT=os.getenv("pyDAmonitor_ROOT")
if pyDAmonitor_ROOT is None:
    print("!!! pyDAmonitor_ROOT is NOT set. Run `source ush/load_pyDAmonitor.sh`")
else:
    print(f"pyDAmonitor_ROOT={pyDAmonitor_ROOT}\n")
sys.path.insert(0, pyDAmonitor_ROOT)
    
# import modules
import warnings
import math
import numpy as np
import uxarray as ux
import xarray as xr
import pandas as pd
import seaborn as sns  # seaborn handles NaN values automatically
import matplotlib.pyplot as plt
from netCDF4 import Dataset
from DAmonitor.base import query_dataset, query_data, query_obj, to_dataframe

pyDAmonitor_ROOT=/home/Guoqing.Ge/pyDAmonitor

CPU times: user 3.23 s, sys: 1.11 s, total: 4.34 s
Wall time: 5.03 s

diag_ana=f"{pyDAmonitor_ROOT}/data/gsi/diag_conv_t_anl.2024050601.nc4"
diag_bkg=f"{pyDAmonitor_ROOT}/data/gsi/diag_conv_t_ges.2024050601.nc4"

Use `obsSpaceGSI` to read GSI diag files#

from DAmonitor.obs import obsSpaceGSI, fit_rate
tana=obsSpaceGSI(diag_ana)
tbkg=obsSpaceGSI(diag_bkg)

query_obj(tana)
# one can access diag data using `tana.data`
# query_data(tana.data)

['data', 'ds', 'filepath']

print(tana.data.Observation_Class)

['      t' '      t' '      t' ... '      t' '      t' '      t']

convert to Pandas DataFrame#

pd.set_option('display.max_columns', None)  # show all dataframe columns
dfana=to_dataframe(tana.data)
dfbkg=to_dataframe(tbkg.data)
pd.set_option("display.max_columns", None)
dfana

	Station_ID	Observation_Class	Observation_Type	Observation_Subtype	Latitude	Longitude	Station_Elevation	Pressure	Height	Time	Prep_QC_Mark	Setup_QC_Mark	Prep_Use_Flag	Analysis_Use_Flag	Nonlinear_QC_Rel_Wgt	Errinv_Input	Errinv_Adjust	Errinv_Final	Observation	Obs_Minus_Forecast_adjusted	Obs_Minus_Forecast_unadjusted	Data_Pof	Data_Vertical_Velocity	Bias_Correction_1	Bias_Correction_2	Bias_Correction_3
0	AU0330	t	131	0	30.026100	236.154800	12400.0	181.500000	12400.0	-0.91944	1.0	1.0	101.0	-1.0	4.0	1.029786	1.029786	1.029786e+00	207.149994	-1.627310	-1.627310	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
1	AU0330	t	131	0	30.130800	236.333405	12400.0	181.500000	12400.0	-0.89500	1.0	1.0	101.0	-1.0	4.0	1.029786	1.029786	1.029786e+00	206.949997	-1.646028	-1.646028	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
2	AU0330	t	131	0	30.636900	237.190308	12400.0	181.500000	12400.0	-0.77833	1.0	1.0	101.0	-1.0	4.0	1.029786	1.029786	1.029786e+00	207.149994	-0.819473	-0.819473	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
3	XZRFGAZA	t	133	0	34.238998	236.003998	10669.0	238.399994	10669.0	-0.70000	1.0	1.0	0.0	1.0	4.0	1.130850	1.130850	1.130850e+00	221.649994	-0.078072	-0.078072	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
4	XZRFGAZA	t	133	0	34.234001	235.731003	10669.0	238.399994	10669.0	-0.66667	1.0	1.0	0.0	1.0	4.0	1.130850	1.130850	1.130850e+00	221.449997	-0.189546	-0.189546	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
63560	F4586	t	188	0	46.589169	298.989838	47.0	1018.400024	47.0	0.00000	2.0	1.0	101.0	-1.0	4.0	0.250000	0.250000	1.000000e-10	282.049988	2.429230	2.446748	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
63561	F9448	t	188	0	46.615002	298.967621	19.0	1021.500000	19.0	0.00000	2.0	1.0	101.0	-1.0	4.0	0.250000	0.250000	1.000000e-10	278.750000	0.310210	-1.171543	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
63562	F3942	t	188	0	46.642830	298.998322	11.0	1022.700012	11.0	0.00000	2.0	1.0	101.0	-1.0	4.0	0.250000	0.250000	1.000000e-10	280.950012	1.367755	0.994733	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
63563	G0274	t	188	0	46.645828	299.047485	10.0	1022.700012	10.0	0.00000	2.0	1.0	101.0	-1.0	4.0	0.250000	0.250000	1.000000e-10	282.049988	2.479401	2.336636	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09
63564	F2780	t	188	0	46.824669	299.187317	42.0	1019.000000	42.0	0.00000	2.0	1.0	101.0	-1.0	4.0	0.250000	0.250000	1.000000e-10	280.950012	2.195708	0.297975	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09	-9.990000e+09

63565 rows × 26 columns

Plot histrogram#

plt.figure(figsize=(8, 5))
#sns.histplot(df["oman"], bins=50, kde=True, color="steelblue")
sns.histplot(tana.data.Obs_Minus_Forecast_adjusted, bins=100, kde=False, color="steelblue")
plt.title("Histogram of oman")
plt.xlabel("oman values")
plt.ylabel("Density")
plt.tight_layout()
plt.show()

../_images/54069f1afdaa12b486c4680da8941caf221c2733a7bfb242bafb0e37a8f7a2ac.png

Plot fitting rate#

## assemble 133 OMB and OMA into a dictionary
df_133_a = dfana[dfana["Observation_Type"] == 133]
df_133_b = dfbkg[dfbkg["Observation_Type"] == 133]

t133={
    'oman': df_133_a["Obs_Minus_Forecast_adjusted"].to_numpy(),
    'ombg': df_133_b["Obs_Minus_Forecast_adjusted"].to_numpy(),
    'height': df_133_a["Station_Elevation"].to_numpy(),
}

dz = 1000
grouped = fit_rate(t133, dz=dz)

# 5. Plot vertical profile of fit_rate vs height
plt.figure(figsize=(7, 6))
plt.plot(grouped["fit_rate"], grouped["height_bin"], marker="o", color="blue")
# plt.axvline(x=0, color="gray", linestyle="--")  # ax vertical line

plt.xlabel("Fit Rate (%)")  # label change
plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x*100:.0f}%'))  # format as %
plt.ylabel("Height Bin (m)")
plt.title("Vertical Profile of Fit Rate")

# Fine-tune ticks
plt.xticks(np.arange(0, 0.25, 0.05))  #, fontsize=12)
plt.yticks(np.arange(0, 13000, dz))  #, , fontsize=12)
# Add minor ticks
from matplotlib.ticker import AutoMinorLocator
plt.gca().xaxis.set_minor_locator(AutoMinorLocator())
plt.gca().yaxis.set_minor_locator(AutoMinorLocator())
# plt.grid(which='both', linestyle='--', linewidth=0.5)
plt.grid(True)

plt.ylim(0, 13000)  # set y-axis from 0 (bottom) to 13,000 (top)
plt.tight_layout()
plt.show()

OMA: bias=0.0242 rms=1.2549
OMB: bias=0.2107 rms=1.4361
Overall fit_rate: 12.6145%
0, -1000, 1.2516, 1.7449, 28.2731%
1, 0, 1.6060, 1.8412, 12.7736%
2, 1000, 0.8703, 1.1811, 26.3083%
3, 2000, 0.7375, 1.0638, 30.6658%
4, 3000, 0.7399, 0.9380, 21.1196%
5, 4000, 0.7347, 0.9778, 24.8696%
6, 5000, 1.2334, 1.5148, 18.5773%
7, 6000, 1.4749, 1.6529, 10.7703%
8, 7000, 1.7494, 1.8635, 6.1250%
9, 8000, 1.5833, 1.6431, 3.6373%
10, 9000, 0.6370, 0.7869, 19.0587%
11, 10000, 1.7016, 1.7773, 4.2578%
12, 11000, 0.7794, 0.9396, 17.0520%
13, 12000, 1.0761, 1.5041, 28.4568%

../_images/7d5f8b172859bf7a5c606a117b0a1ad1e93738d97c02d66aa9987ad29666bfb6.png

For reference, check diag file contents using netCDF4 directly#

dataset=Dataset(diag_ana, mode='r')
query_dataset(dataset)

Station_ID, Observation_Class, Observation_Type, Observation_Subtype, Latitude, Longitude, Station_Elevation, Pressure, Height, Time, Prep_QC_Mark, Setup_QC_Mark, Prep_Use_Flag, Analysis_Use_Flag, Nonlinear_QC_Rel_Wgt, Errinv_Input, Errinv_Adjust, Errinv_Final, Observation, Obs_Minus_Forecast_adjusted, Obs_Minus_Forecast_unadjusted, Data_Pof, Data_Vertical_Velocity, Bias_Correction_Terms

check the shape/ndim of each variable in the netCDF4 dataset#

for var in dataset.variables:
    print(var, dataset.variables[var][:].shape) #ndim

Station_ID (63565, 8)
Observation_Class (63565, 7)
Observation_Type (63565,)
Observation_Subtype (63565,)
Latitude (63565,)
Longitude (63565,)
Station_Elevation (63565,)
Pressure (63565,)
Height (63565,)
Time (63565,)
Prep_QC_Mark (63565,)
Setup_QC_Mark (63565,)
Prep_Use_Flag (63565,)
Analysis_Use_Flag (63565,)
Nonlinear_QC_Rel_Wgt (63565,)
Errinv_Input (63565,)
Errinv_Adjust (63565,)
Errinv_Final (63565,)
Observation (63565,)
Obs_Minus_Forecast_adjusted (63565,)
Obs_Minus_Forecast_unadjusted (63565,)
Data_Pof (63565,)
Data_Vertical_Velocity (63565,)
Bias_Correction_Terms (63565, 3)

print(dataset.variables["Bias_Correction_Terms"][:])

[[-9.99e+09 -9.99e+09 -9.99e+09]
 [-9.99e+09 -9.99e+09 -9.99e+09]
 [-9.99e+09 -9.99e+09 -9.99e+09]
 ...
 [-9.99e+09 -9.99e+09 -9.99e+09]
 [-9.99e+09 -9.99e+09 -9.99e+09]
 [-9.99e+09 -9.99e+09 -9.99e+09]]

check attributes#

for stmp in dataset.variables["Data_Pof"].ncattrs():
    print(stmp)

print(dataset.variables["Data_Pof"].ncattrs())

[]