import xarray as xr
import numpy as np
import intake
import ast
import matplotlib.pyplot as plt
Run the cell below if the notebook is running on an NCAR supercomputer.
If the notebook is running on a different parallel computing environment,
replace the usage of `NCARCluster` with a similar cluster object from
`dask_jobqueue` or `dask_gateway`.
# Launch a Dask cluster on the NCAR batch system and attach a client to it.
# NOTE(review): this submits real batch jobs — run only on a system with
# ncar_jobqueue configured.
import dask
from ncar_jobqueue import NCARCluster

# Number of batch jobs (workers) to request, and the per-job resource limits.
num_jobs = 20
walltime = "1:00:00"
memory='20GB'

cluster = NCARCluster(walltime=walltime, memory=memory)
# Scale the cluster out to the requested number of jobs (workers start asynchronously).
cluster.scale(jobs=num_jobs)

from distributed import Client
# Connecting a Client makes this cluster the default executor for Dask operations below.
client = Client(cluster)
# Displaying the cluster object shows the dashboard link in the notebook.
cluster
☝️ Link to Dask dashboard will appear above.
# Define the catalog description file location (an intake-esm JSON descriptor on S3).
catalog_url = "https://ncar-na-cordex.s3-us-west-2.amazonaws.com/catalogs/aws-na-cordex.json"

# Have the catalog interpret the "na-cordex-models" column as a list of values.
# ast.literal_eval safely parses the stringified Python lists stored in the CSV.
# NOTE(review): newer intake-esm versions renamed `csv_kwargs` to `read_csv_kwargs` —
# confirm against the installed intake-esm version.
col = intake.open_esm_datastore(catalog_url, csv_kwargs={"converters": {"na-cordex-models": ast.literal_eval}},)
# Display a summary of the catalog contents.
col
aws-na-cordex catalog with 330 dataset(s) from 330 asset(s):
unique | |
---|---|
variable | 15 |
standard_name | 10 |
long_name | 18 |
units | 10 |
spatial_domain | 1 |
grid | 2 |
spatial_resolution | 2 |
scenario | 6 |
start_time | 3 |
end_time | 4 |
frequency | 1 |
vertical_levels | 1 |
bias_correction | 3 |
na-cordex-models | 26 |
path | 330 |
# Show the first few lines of the catalog's underlying pandas DataFrame.
col.df.head(10)
variable | standard_name | long_name | units | spatial_domain | grid | spatial_resolution | scenario | start_time | end_time | frequency | vertical_levels | bias_correction | na-cordex-models | path | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | eval | 1979-01-01T12:00:00 | 2014-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.CRCM5-OUR, ERA-In... | s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i.... |
1 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | eval | 1979-01-01T12:00:00 | 2015-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.RegCM4, ERA-Int.H... | s3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.... |
2 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [CanESM2.CanRCM4] | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
3 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [CanESM2.CanRCM4] | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
4 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | raw | [GFDL-ESM2M.CRCM5-OUR, CanESM2.CRCM5-OUR, CanE... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
5 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
6 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
7 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | raw | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
8 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp85 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [MPI-ESM-MR.CRCM5-UQAM, GEMatm-Can.CRCM5-UQAM,... | s3://ncar-na-cordex/day/hurs.hist-rcp85.day.NA... |
9 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp85 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [MPI-ESM-MR.CRCM5-UQAM, GEMatm-Can.CRCM5-UQAM,... | s3://ncar-na-cordex/day/hurs.hist-rcp85.day.NA... |
# Produce a catalog content summary: the unique values found in selected columns.
import pprint

uniques = col.unique(
columns=["variable", "scenario", "grid", "na-cordex-models", "bias_correction"]
)
# compact/indent keep the (nested) summary dictionary readable.
pprint.pprint(uniques, compact=True, indent=4)
{ 'bias_correction': { 'count': 3, 'values': ['mbcn-Daymet', 'raw', 'mbcn-gridMET']}, 'grid': {'count': 2, 'values': ['NAM-44i', 'NAM-22i']}, 'na-cordex-models': { 'count': 26, 'values': [ 'MPI-ESM-LR.CRCM5-OUR', 'CanESM2.CRCM5-OUR', 'GFDL-ESM2M.WRF', 'HadGEM2-ES.RegCM4', 'CanESM2.CRCM5-UQAM', 'ERA-Int.RCA4', 'ERA-Int.CRCM5-OUR', 'EC-EARTH.HIRHAM5', 'MPI-ESM-MR.CRCM5-UQAM', 'ERA-Int.HIRHAM5', 'MPI-ESM-LR.WRF', 'HadGEM2-ES.WRF', 'GEMatm-MPI.CRCM5-UQAM', 'EC-EARTH.RCA4', 'GEMatm-Can.CRCM5-UQAM', 'CNRM-CM5.CRCM5-OUR', 'ERA-Int.WRF', 'CanESM2.RCA4', 'MPI-ESM-LR.CRCM5-UQAM', 'ERA-Int.CanRCM4', 'ERA-Int.CRCM5-UQAM', 'GFDL-ESM2M.RegCM4', 'CanESM2.CanRCM4', 'GFDL-ESM2M.CRCM5-OUR', 'ERA-Int.RegCM4', 'MPI-ESM-LR.RegCM4']}, 'scenario': { 'count': 6, 'values': [ 'rcp45', 'hist-rcp45', 'hist-rcp85', 'rcp85', 'hist', 'eval']}, 'variable': { 'count': 15, 'values': [ 'ps', 'tmin', 'tasmin', 'tmax', 'vas', 'hurs', 'tasmax', 'rsds', 'pr', 'prec', 'uas', 'tas', 'sfcWind', 'huss', 'temp']}}
# Variable of interest: daily maximum near-surface air temperature.
data_var = 'tmax'

# Narrow the catalog to a single dataset: tmax on the 0.50-degree grid,
# ERA-Interim evaluation scenario, with no bias correction applied.
col_subset = col.search(
    variable=data_var,
    grid="NAM-44i",
    scenario="eval",
    bias_correction="raw",
)
# Display a summary of the subsetted catalog.
col_subset
aws-na-cordex catalog with 1 dataset(s) from 1 asset(s):
unique | |
---|---|
variable | 1 |
standard_name | 1 |
long_name | 1 |
units | 1 |
spatial_domain | 1 |
grid | 1 |
spatial_resolution | 1 |
scenario | 1 |
start_time | 1 |
end_time | 1 |
frequency | 1 |
vertical_levels | 1 |
bias_correction | 1 |
na-cordex-models | 6 |
path | 1 |
col_subset.df
variable | standard_name | long_name | units | spatial_domain | grid | spatial_resolution | scenario | start_time | end_time | frequency | vertical_levels | bias_correction | na-cordex-models | path | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | tmax | air_temperature | Daily Maximum Near-Surface Air Temperature | degC | north_america | NAM-44i | 0.50 deg | eval | 1979-01-01T12:00:00 | 2015-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.RegCM4, ERA-Int.H... | s3://ncar-na-cordex/day/tmax.eval.day.NAM-44i.... |
# Load catalog entries for subset into a dictionary of xarray datasets, and open the first one.
# The Zarr stores live in a public S3 bucket, hence anonymous access.
dsets = col_subset.to_dataset_dict(
zarr_kwargs={"consolidated": True}, storage_options={"anon": True}
)
print(f"\nDataset dictionary keys:\n {dsets.keys()}")

# Load the first dataset and display a summary.
dataset_key = list(dsets.keys())[0]
# Reconstruct the Zarr store name from the key, for use in plot titles below.
store_name = dataset_key + ".zarr"

ds = dsets[dataset_key]
ds
# Note that the summary includes a 'member_id' coordinate, which is a renaming of the
# 'na-cordex-models' column in the catalog.
--> The keys in the returned dictionary of datasets are constructed as follows: 'variable.frequency.scenario.grid.bias_correction'
Dataset dictionary keys: dict_keys(['tmax.day.eval.NAM-44i.raw'])
<xarray.Dataset> Dimensions: (bnds: 2, lat: 129, lon: 300, member_id: 6, time: 13514) Coordinates: * lat (lat) float64 12.25 12.75 13.25 13.75 ... 74.75 75.25 75.75 76.25 * lon (lon) float64 -171.8 -171.2 -170.8 ... -23.25 -22.75 -22.25 * member_id (member_id) <U18 'ERA-Int.CRCM5-UQAM' ... 'ERA-Int.WRF' * time (time) datetime64[ns] 1979-01-01T12:00:00 ... 2015-12-31T12:00:00 time_bnds (time, bnds) datetime64[ns] dask.array<chunksize=(13514, 2), meta=np.ndarray> Dimensions without coordinates: bnds Data variables: tmax (member_id, time, lat, lon) float32 dask.array<chunksize=(4, 1000, 65, 150), meta=np.ndarray> Attributes: (12/26) CORDEX_domain: NAM-44 contact: {"ERA-Int.CRCM5-UQAM": "Winger.Katja@uqam... contact_note: {"ERA-Int.RegCM4": "Simulations by Ray Ar... creation_date: {"ERA-Int.CRCM5-UQAM": "2015-06-18T10:56:... driving_experiment: {"ERA-Int.CRCM5-UQAM": "ECMWF-ERAINT, eva... driving_experiment_name: evaluation ... ... tracking_id: {"ERA-Int.CRCM5-UQAM": "a8d17321-0e16-46e... version: {"ERA-Int.CRCM5-UQAM": "1.2", "ERA-Int.Re... zarr-dataset-reference: For dataset documentation, see DOI https:... zarr-version: 1.0 intake_esm_varname: ['tmax'] intake_esm_dataset_key: tmax.day.eval.NAM-44i.raw
array([12.25, 12.75, 13.25, 13.75, 14.25, 14.75, 15.25, 15.75, 16.25, 16.75, 17.25, 17.75, 18.25, 18.75, 19.25, 19.75, 20.25, 20.75, 21.25, 21.75, 22.25, 22.75, 23.25, 23.75, 24.25, 24.75, 25.25, 25.75, 26.25, 26.75, 27.25, 27.75, 28.25, 28.75, 29.25, 29.75, 30.25, 30.75, 31.25, 31.75, 32.25, 32.75, 33.25, 33.75, 34.25, 34.75, 35.25, 35.75, 36.25, 36.75, 37.25, 37.75, 38.25, 38.75, 39.25, 39.75, 40.25, 40.75, 41.25, 41.75, 42.25, 42.75, 43.25, 43.75, 44.25, 44.75, 45.25, 45.75, 46.25, 46.75, 47.25, 47.75, 48.25, 48.75, 49.25, 49.75, 50.25, 50.75, 51.25, 51.75, 52.25, 52.75, 53.25, 53.75, 54.25, 54.75, 55.25, 55.75, 56.25, 56.75, 57.25, 57.75, 58.25, 58.75, 59.25, 59.75, 60.25, 60.75, 61.25, 61.75, 62.25, 62.75, 63.25, 63.75, 64.25, 64.75, 65.25, 65.75, 66.25, 66.75, 67.25, 67.75, 68.25, 68.75, 69.25, 69.75, 70.25, 70.75, 71.25, 71.75, 72.25, 72.75, 73.25, 73.75, 74.25, 74.75, 75.25, 75.75, 76.25])
array([-171.75, -171.25, -170.75, ..., -23.25, -22.75, -22.25])
array(['ERA-Int.CRCM5-UQAM', 'ERA-Int.RegCM4', 'ERA-Int.HIRHAM5', 'ERA-Int.RCA4', 'ERA-Int.CanRCM4', 'ERA-Int.WRF'], dtype='<U18')
array(['1979-01-01T12:00:00.000000000', '1979-01-02T12:00:00.000000000', '1979-01-03T12:00:00.000000000', ..., '2015-12-29T12:00:00.000000000', '2015-12-30T12:00:00.000000000', '2015-12-31T12:00:00.000000000'], dtype='datetime64[ns]')
|
|
def plotMap(ax, map_slice, date_object=None, member_id=None):
    """Create a map plot on the given axes, with min/max as text.

    Parameters
    ----------
    ax : matplotlib Axes to draw into.
    map_slice : 2-D xarray.DataArray with 'lat' and 'lon' dimensions.
    date_object : optional scalar time coordinate; if given, used as the title.
    member_id : optional string; if given, used as the y-axis label.

    Returns
    -------
    The same Axes object, for optional chaining.
    """
    ax.imshow(map_slice, origin='lower')

    # Reduce over the spatial dimensions to scalar min/max for annotation.
    minval = map_slice.min(dim = ['lat', 'lon'])
    maxval = map_slice.max(dim = ['lat', 'lon'])

    # Format values with 4 significant digits. The original "%3g" set a minimum
    # field *width* of 3, not the precision the comment promised; "%.4g" matches
    # the stated intent. float() avoids %-formatting a 0-d DataArray directly.
    ax.text(0.01, 0.03, "Min: %.4g" % float(minval), transform=ax.transAxes, fontsize=12)
    ax.text(0.99, 0.03, "Max: %.4g" % float(maxval), transform=ax.transAxes, fontsize=12, horizontalalignment='right')

    # Hide tick marks: pixel indices are not meaningful coordinates here.
    ax.set_xticks([])
    ax.set_yticks([])
    if date_object:
        # Title with just the date portion (first 10 chars of the ISO timestamp).
        ax.set_title(date_object.values.astype(str)[:10], fontsize=12)
    if member_id:
        ax.set_ylabel(member_id, fontsize=12)
    return ax
def getValidDateIndexes(member_slice):
    """Search for the first and last dates with finite values.

    Returns a (start_index, end_index) pair of positions along the time
    dimension whose spatial minimum over ('lat', 'lon') is finite.
    """
    spatial_min = member_slice.min(dim=['lat', 'lon'])
    # Positions of all time steps with a finite spatial minimum, in order.
    valid_positions = np.flatnonzero(np.isfinite(spatial_min))
    return valid_positions[0], valid_positions[-1]
def plot_first_mid_last(ds, data_var, store_name):
    """Plot the first, middle, and final time steps for several climate runs."""
    num_members_to_plot = 4
    num_plot_columns = 3
    members = ds.coords['member_id'].values[0:num_members_to_plot]

    fig, axs = plt.subplots(num_members_to_plot, num_plot_columns,
                            figsize=(18, 12), constrained_layout=True)

    # The midpoint of the time axis is the same for every member; compute once.
    mid_index = np.floor(len(ds.time) / 2).astype(int)
    mid_date = ds.time[mid_index]

    for row, mem_id in enumerate(members):
        run = ds[data_var].sel(member_id=mem_id)
        # First/last time steps that actually contain data for this member.
        first_index, last_index = getValidDateIndexes(run)
        first_date = ds.time[first_index]
        last_date = ds.time[last_index]

        # Column 0 carries the member label; the others show only the date.
        plotMap(axs[row, 0], run.sel(time=first_date), first_date, mem_id)
        plotMap(axs[row, 1], run.sel(time=mid_date), mid_date)
        plotMap(axs[row, 2], run.sel(time=last_date), last_date)

    plt.suptitle(f'First, Middle, and Last Timesteps for Selected Runs in "{store_name}"', fontsize=20)
    return fig
def plot_stat_maps(ds, data_var, store_name):
    """Plot the mean, min, max, and standard deviation values for several climate runs, aggregated over time."""
    num_members_to_plot = 4
    num_plot_columns = 4
    members = ds.coords['member_id'].values[0:num_members_to_plot]

    fig, axs = plt.subplots(num_members_to_plot, num_plot_columns,
                            figsize=(25, 12), constrained_layout=True)

    for row, mem_id in enumerate(members):
        run = ds[data_var].sel(member_id=mem_id)
        # One statistic per column, each aggregated over the time dimension.
        # Only the first column carries the member label.
        plotMap(axs[row, 0], run.min(dim='time'), member_id=mem_id)
        plotMap(axs[row, 1], run.max(dim='time'))
        plotMap(axs[row, 2], run.mean(dim='time'))
        plotMap(axs[row, 3], run.std(dim='time'))

    # Column headers go on the top row only.
    for col_num, stat_name in enumerate(['min', 'max', 'mean', 'std']):
        axs[0, col_num].set_title(f'{stat_name}({data_var})', fontsize=15)

    plt.suptitle(f'Spatial Statistics for Selected Runs in "{store_name}"', fontsize=20)
    return fig
The following plot also shows which dates have no available data values, as a rug plot along the bottom of each panel.
def plot_timeseries(ds, data_var, store_name):
    """Plot the mean, min, max, and standard deviation values for several climate runs,
    aggregated over lat/lon dimensions."""
    # Limit the figure to the first few ensemble members to keep it readable.
    num_members_to_plot = 4
    member_names = ds.coords['member_id'].values[0:num_members_to_plot]
    figWidth = 25
    figHeight = 20
    linewidth = 0.5
    numPlotColumns = 1
    fig, axs = plt.subplots(num_members_to_plot, numPlotColumns, figsize=(figWidth, figHeight))
    for index in np.arange(num_members_to_plot):
        mem_id = member_names[index]
        data_slice = ds[data_var].sel(member_id=mem_id)
        unit_string = ds[data_var].attrs['units']
        # Spatially aggregated statistics: one value per time step.
        min_vals = data_slice.min(dim = ['lat', 'lon'])
        max_vals = data_slice.max(dim = ['lat', 'lon'])
        mean_vals = data_slice.mean(dim = ['lat', 'lon'])
        std_vals = data_slice.std(dim = ['lat', 'lon'])
        # Time steps whose spatial minimum is NaN have no data for this member;
        # collect their dates for the rug plot below.
        missing_indexes = np.isnan(min_vals)
        missing_times = ds.time[missing_indexes]
        axs[index].plot(ds.time, max_vals, linewidth=linewidth, label='max', color='red')
        axs[index].plot(ds.time, mean_vals, linewidth=linewidth, label='mean', color='black')
        # Shade one standard deviation around the mean.
        axs[index].fill_between(ds.time, (mean_vals - std_vals), (mean_vals + std_vals),
                                color='grey', linewidth=0, label='std', alpha=0.5)
        axs[index].plot(ds.time, min_vals, linewidth=linewidth, label='min', color='blue')
        # Place rug marks for missing dates just above the bottom of the axes.
        # NOTE: get_ylim() must be called after the series are plotted, so the
        # limits reflect the data range.
        ymin, ymax = axs[index].get_ylim()
        rug_y = ymin + 0.01*(ymax-ymin)
        axs[index].plot(missing_times, [rug_y]*len(missing_times), '|', color='m', label='missing')
        axs[index].set_title(mem_id, fontsize=20)
        axs[index].legend(loc='upper right')
        axs[index].set_ylabel(unit_string)
    # Pad the layout so the suptitle does not overlap the top panel.
    plt.tight_layout(pad=10.2, w_pad=3.5, h_pad=3.5)
    plt.suptitle(f'Temporal Statistics for Selected Runs in "{store_name}"', fontsize=20)
    return fig
%%time
# Plot first/middle/last maps using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_first_mid_last(ds, data_var, store_name)
plt.show()
CPU times: user 7.07 s, sys: 470 ms, total: 7.54 s Wall time: 1min 2s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_FML.png'
    figure.savefig(plotfile, dpi=100)
%%time
# Plot spatial statistics maps using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_stat_maps(ds, data_var, store_name)
plt.show()
CPU times: user 30.8 s, sys: 1.81 s, total: 32.6 s Wall time: 4min 20s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_MAPS.png'
    figure.savefig(plotfile, dpi=100)
%%time
# Plot temporal statistics using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_timeseries(ds, data_var, store_name)
plt.show()
CPU times: user 42.7 s, sys: 4.06 s, total: 46.8 s Wall time: 5min 49s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_TS.png'
    figure.savefig(plotfile, dpi=100)
!date
cluster.close()