This Jupyter Notebook demonstrates how to use the NCAR Community Earth System Model (CESM) Large Ensemble (LENS) data hosted on AWS S3 (doi:10.26024/wt24-5j82). The notebook shows how to reproduce Figures 2 and 4 from the Kay et al. (2015) paper describing the CESM LENS dataset (doi:10.1175/BAMS-D-13-00255.1).
This resource is intended to be helpful for people who are not familiar with elements of the Pangeo framework, including Jupyter Notebooks, Xarray, and the Zarr data format, or with the original paper, so it includes additional explanation.
Notebook version 3.3 (2019 Dec 19)
# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Silence dask.distributed logs
import logging
logger = logging.getLogger("distributed.utils_perf")
logger.setLevel(logging.ERROR)
import dask.config
dask.config.set({'distributed.logging.distributed': 'error'})
import intake
import numpy as np
import pandas as pd
import xarray as xr
# Create cluster
from dask_kubernetes import KubeCluster
from dask.distributed import Client
cluster = KubeCluster()
cluster.adapt(minimum=2, maximum=100, wait_count=60)
# Connect to cluster
client = Client(cluster)
# Display cluster dashboard URL
cluster
☝️ Link to scheduler dashboard will appear above.
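Note: If you are not running in a Kubernetes-backed Pangeo deployment, a local Dask cluster is a drop-in substitute. The following is a sketch (not part of the original workflow); adjust worker counts to your machine.
# Optional fallback: local Dask cluster instead of KubeCluster (sketch)
#from dask.distributed import LocalCluster
#cluster = LocalCluster(n_workers=4, threads_per_worker=1)
#client = Client(cluster)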
# Open collection description file
intakeEsmUrl = 'https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le.json'
col = intake.open_esm_datastore(intakeEsmUrl)
print(col._col_data['description'])  # Description of collection
print("Catalog file:", col._col_data['catalog_file'])
print(col)  # Summary of collection structure
# Show expanded version of collection structure with details
import pprint
uniques = col.unique(columns=["component", "frequency", "experiment", "variable"])
pprint.pprint(uniques, compact=True, indent=4)
# Show the first few lines of the catalog
col.df.head(10)
# Optional: inspect the collection object
#import inspect
#inspect.getmembers(col)
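Since the catalog is exposed as a Pandas DataFrame (col.df), ordinary DataFrame operations also work for quick exploration. A sketch:
# Optional: Pandas-style queries against the catalog DataFrame (sketch)
#col.df[col.df.variable == "TREFHT"].head()
#col.df.query("component == 'atm' and frequency == 'daily'").head()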
# Search the catalog to find the desired data, in this case the reference height temperature
# of the atmosphere, at daily time resolution, for the Historical, 20th Century, and RCP8.5 (IPCC
# Representative Concentration Pathway 8.5) experiments
col_subset = col.search(frequency=["daily", "monthly"], component="atm", variable="TREFHT",
                        experiment=["20C", "RCP85", "HIST"])
print("Data subset:")
col_subset.df
# Load catalog entries for subset into a dictionary of xarray datasets
dsets = col_subset.to_dataset_dict(zarr_kwargs={"consolidated": True}, storage_options={"anon": True})
print("\nDataset dictionary keys:\n", dsets.keys())
# Define Xarray datasets corresponding to the three experiments
ds_HIST = dsets['atm.HIST.monthly']
ds_20C = dsets['atm.20C.daily']
ds_RCP85 = dsets['atm.RCP85.daily']
# Use Dask.Distributed utility function to display size of each dataset
from distributed.utils import format_bytes
print("\n",
"Historical:", format_bytes(ds_HIST.nbytes), "\n",
"20th Century:", format_bytes(ds_20C.nbytes), "\n",
"RCP8.5:", format_bytes(ds_RCP85.nbytes))
# Extract the Reference Height Temperature data variable
t_hist = ds_HIST["TREFHT"]
t_20c = ds_20C["TREFHT"]
t_rcp = ds_RCP85["TREFHT"]
print("Description of 20C data:\n", t_20c)
# The global surface temperature anomaly was computed relative to the 1961-90 base period
# in the Kay et al. paper, so extract that time slice
t_ref = t_20c.sel(time=slice("1961", "1990"))
Cell size varies with latitude, so this must be accounted for when computing the global mean.
Note: Each Zarr store includes area values and other ancillary information in addition to the actual data. A possible optimization to reduce data size would be to extract the duplicated information into separate objects.
cell_area = ds_20C.area
total_area = cell_area.sum()
cell_area
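Note: newer versions of Xarray (0.15.1 and later) provide a built-in weighted-reduction API that expresses the same area weighting more compactly. A sketch of the equivalent computation, assuming a sufficiently recent Xarray:
# Optional: area-weighted mean via Xarray's weighted API (sketch; requires xarray >= 0.15.1)
#t_ref_ts_alt = (
#    t_ref.resample(time="AS").mean("time")
#    .weighted(cell_area)
#    .mean(dim=("lat", "lon"))
#)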
Note: resample(time="AS") performs Annual resampling anchored at the Start of each calendar year. See the Pandas documentation on resampling and offset aliases for details.
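A tiny standalone illustration of the "AS" anchor, using plain Pandas (a sketch; the sample dates are arbitrary):
# Sketch: "AS" resampling labels each group at January 1 of its year
demo_idx = pd.date_range("1961-03-31", periods=24, freq="M")  # arbitrary monthly dates
pd.Series(range(24), index=demo_idx).resample("AS").mean()  # index: 1961-01-01, 1962-01-01, 1963-01-01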
t_ref_ts = (
    (t_ref.resample(time="AS").mean("time") * cell_area).sum(dim=("lat", "lon"))
    / total_area
).mean(dim=("time", "member_id"))
t_hist_ts = (
    (t_hist.resample(time="AS").mean("time") * cell_area).sum(dim=("lat", "lon"))
) / total_area
t_20c_ts = (
    (t_20c.resample(time="AS").mean("time") * cell_area).sum(dim=("lat", "lon"))
) / total_area
t_rcp_ts = (
    (t_rcp.resample(time="AS").mean("time") * cell_area).sum(dim=("lat", "lon"))
) / total_area
Note: Pangeo's "lazy execution" philosophy means that until this point we have not actually read the bulk of the data. The following steps take a while to complete, so we include the Notebook "cell magic" directive %%time to display the elapsed and CPU times after each computation.
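For long chains of deferred operations it can also be useful to keep intermediate results on the workers with .persist() rather than pulling them into the notebook with .load(). A sketch:
# Optional: persist keeps data distributed on the cluster (sketch)
#from dask.distributed import progress
#t_20c_ts = t_20c_ts.persist()  # starts computation; results stay on the workers
#progress(t_20c_ts)             # asynchronous progress bar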
%%time
t_ref_mean = t_ref_ts.load()
t_ref_mean
%%time
t_hist_ts_df = t_hist_ts.to_series().T
t_hist_ts_df.head()
%%time
t_20c_ts_df = t_20c_ts.to_series().unstack().T
t_20c_ts_df.head()
%%time
t_rcp_ts_df = t_rcp_ts.to_series().unstack().T
t_rcp_ts_df.head()
# Observational time series data for comparison with ensemble average
obsDataURL = "https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/cru/hadcrut4/air.mon.anom.median.nc"
ds = xr.open_dataset(obsDataURL).load()
ds
def weighted_temporal_mean(ds):
    """Compute annual means of the "air" variable, weighting each time
    interval by its length and normalizing by the non-missing fraction."""
    # Length of each time interval, from the time bounds
    time_bound_diff = ds.time_bnds.diff(dim="nbnds")[:, 0]
    # Fraction of each year contributed by each interval
    wgts = time_bound_diff.groupby("time.year") / time_bound_diff.groupby(
        "time.year"
    ).sum(xr.ALL_DIMS)
    # The weights within each year must sum to 1
    np.testing.assert_allclose(wgts.groupby("time.year").sum(xr.ALL_DIMS), 1.0)
    obs = ds["air"]
    # Mask distinguishing valid (non-missing) observations
    cond = obs.isnull()
    ones = xr.where(cond, 0.0, 1.0)
    # Weighted annual sums of the data and of the valid-data weights
    obs_sum = (obs * wgts).resample(time="AS").sum(dim="time")
    ones_out = (ones * wgts).resample(time="AS").sum(dim="time")
    # Normalize, average over the globe, and convert to a time series
    obs_s = (obs_sum / ones_out).mean(("lat", "lon")).to_series()
    return obs_s
obs_s = weighted_temporal_mean(ds)
obs_s = obs_s['1920':]
obs_s.head()
all_ts_anom = pd.concat([t_20c_ts_df, t_rcp_ts_df]) - t_ref_mean.data
years = [val.year for val in all_ts_anom.index]
obs_years = [val.year for val in obs_s.index]
# Combine ensemble member 1 data from historical and 20th century experiments
hist_anom = t_hist_ts_df - t_ref_mean.data
# Drop the last two years of the historical series, which overlap the start of
# the 20C series (verify_integrity=True would flag any duplicated index entries)
member1 = pd.concat([hist_anom.iloc[:-2], all_ts_anom.iloc[:, 0]], verify_integrity=True)
member1_years = [val.year for val in member1.index]
import matplotlib.pyplot as plt
ax = plt.axes()
ax.tick_params(right=True, top=True, direction="out", length=6, width=2, grid_alpha=0.5)
ax.plot(years, all_ts_anom.iloc[:,1:], color="grey")
ax.plot(obs_years, obs_s['1920':], color="red")
ax.plot(member1_years, member1, color="black")
ax.text(
    0.35,
    0.4,
    "observations",
    verticalalignment="bottom",
    horizontalalignment="left",
    transform=ax.transAxes,
    color="red",
    fontsize=10,
)
ax.text(
    0.35,
    0.33,
    "members 2-40",
    verticalalignment="bottom",
    horizontalalignment="left",
    transform=ax.transAxes,
    color="grey",
    fontsize=10,
)
ax.text(
    0.05,
    0.2,
    "member 1",
    verticalalignment="bottom",
    horizontalalignment="left",
    transform=ax.transAxes,
    color="black",
    fontsize=10,
)
ax.set_xticks([1850, 1920, 1950, 2000, 2050, 2100])
plt.ylim(-1, 5)
plt.xlim(1850, 2100)
plt.ylabel("Global Surface\nTemperature Anomaly (K)")
plt.show()
The figure will appear above when ready. Compare with Fig. 2 of Kay et al. 2015 (doi:10.1175/BAMS-D-13-00255.1).
def linear_trend(da, dim="time"):
    """Compute the linear trend along `dim` at every grid point, in parallel via Dask."""
    # The full time series must live in a single chunk for the fit
    da_chunk = da.chunk({dim: -1})
    trend = xr.apply_ufunc(
        calc_slope,
        da_chunk,
        vectorize=True,
        input_core_dims=[[dim]],
        output_core_dims=[[]],
        output_dtypes=[np.float64],
        dask="parallelized",
    )
    return trend

def calc_slope(y):
    """ufunc to be used by linear_trend"""
    x = np.arange(len(y))
    # drop missing values (NaNs) from x and y
    finite_indexes = ~np.isnan(y)
    slope = np.nan if (np.sum(finite_indexes) < 2) else np.polyfit(x[finite_indexes], y[finite_indexes], 1)[0]
    return slope
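A quick sanity check of calc_slope on synthetic data (a sketch, not part of the original notebook): with one missing value, the fit should still recover the underlying unit slope.
# Sanity check: calc_slope should recover a unit slope despite a NaN
y_test = np.array([0.0, 1.0, np.nan, 3.0, 4.0])
print(calc_slope(y_test))  # expect approximately 1.0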
t = xr.concat([t_20c, t_rcp], dim="time")
seasons = t.sel(time=slice("1979", "2012")).resample(time="QS-DEC").mean("time")
# Include only full seasons from 1979 through 2012
seasons = seasons.sel(time=slice("1979", "2012")).load()
winter_seasons = seasons.sel(
    time=seasons.time.where(seasons.time.dt.month == 12, drop=True)
)
winter_trends = linear_trend(
    winter_seasons.chunk({"lat": 20, "lon": 20, "time": -1})
).load() * len(winter_seasons.time)
# Compute ensemble mean from the first 30 members
winter_trends_mean = winter_trends.isel(member_id=range(30)).mean(dim='member_id')
# Make sure that we have 34 seasons
assert len(winter_seasons.time) == 34
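To confirm that the QS-DEC resampling retained only December-anchored (DJF) seasons, one can inspect the month of each remaining time stamp (a sketch):
# Sanity check: every retained season should begin in December
print(winter_seasons.time.dt.month.values)  # expect all 12s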
# Observational time series data for comparison with ensemble average
# NASA GISS Surface Temperature Analysis, https://data.giss.nasa.gov/gistemp/
obsDataURL = "https://data.giss.nasa.gov/pub/gistemp/gistemp1200_GHCNv4_ERSSTv5.nc.gz"
# Download, unzip, and load file
import os
os.system("wget " + obsDataURL)
obsDataFileName = obsDataURL.split('/')[-1]
os.system("gunzip " + obsDataFileName)
obsDataFileName = obsDataFileName[:-3]
ds = xr.open_dataset(obsDataFileName).load()
ds
# Remap longitude range from [-180, 180] to [0, 360] for plotting purposes
ds = ds.assign_coords(lon=((ds.lon + 360) % 360)).sortby('lon')
ds
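The modulo arithmetic above maps western longitudes onto [0, 360). A standalone sketch of the remapping on sample values:
# Sketch: remap sample longitudes from [-180, 180] to [0, 360)
lons_sample = np.array([-180.0, -90.0, 0.0, 90.0, 179.0])
print((lons_sample + 360) % 360)  # [180. 270.   0.  90. 179.]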
obs_seasons = ds.sel(time=slice("1979", "2012")).resample(time="QS-DEC").mean("time")
# Include only full seasons from 1979 through 2012
obs_seasons = obs_seasons.sel(time=slice("1979", "2012")).load()
# Compute observed winter trends
obs_winter_seasons = obs_seasons.sel(
    time=obs_seasons.time.where(obs_seasons.time.dt.month == 12, drop=True)
)
obs_winter_seasons
obs_winter_trends = linear_trend(
    obs_winter_seasons.chunk({"lat": 20, "lon": 20, "time": -1})
).load() * len(obs_winter_seasons.time)
obs_winter_trends
import cmaps # for NCL colormaps
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
contour_levels = [-6, -5, -4, -3, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 5, 6]
color_map = cmaps.ncl_default
def make_map_plot(nplot_rows, nplot_cols, plot_index, data, plot_label):
    """ Create a single map subplot. """
    ax = plt.subplot(nplot_rows, nplot_cols, plot_index, projection=ccrs.Robinson(central_longitude=180))
    cplot = plt.contourf(lons, lats, data,
                         levels=contour_levels,
                         cmap=color_map,
                         extend='both',
                         transform=ccrs.PlateCarree())
    ax.coastlines(color='grey')
    ax.text(0.01, 0.01, plot_label, fontsize=14, transform=ax.transAxes)
    return cplot, ax
# Generate plot (may take a while as many individual maps are generated)
numPlotRows = 8
numPlotCols = 4
figWidth = 20
figHeight = 30
fig, axs = plt.subplots(numPlotRows, numPlotCols, figsize=(figWidth,figHeight))
lats = winter_trends.lat
lons = winter_trends.lon
# Create ensemble member plots
for ensemble_index in range(30):
    plot_data = winter_trends.isel(member_id=ensemble_index)
    plot_index = ensemble_index + 1
    plot_label = str(plot_index)
    plotRow = ensemble_index // numPlotCols
    plotCol = ensemble_index % numPlotCols
    # Retain axes objects for the figure colorbar
    cplot, axs[plotRow, plotCol] = make_map_plot(numPlotRows, numPlotCols, plot_index, plot_data, plot_label)
# Create plots for the ensemble mean, observations, and a figure color bar.
cplot, axs[7,2] = make_map_plot(numPlotRows, numPlotCols, 31, winter_trends_mean, 'EM')
lats = obs_winter_trends.lat
lons = obs_winter_trends.lon
cplot, axs[7,3] = make_map_plot(numPlotRows, numPlotCols, 32, obs_winter_trends.tempanomaly, 'OBS')
cbar = fig.colorbar(cplot, ax=axs, orientation='horizontal', shrink = 0.7, pad = 0.02)
cbar.ax.set_title('1979-2012 DJF surface air temperature trends (K/34 years)', fontsize=16)
cbar.set_ticks(contour_levels)
cbar.set_ticklabels(contour_levels)
Figure will appear above when ready. Compare with Fig. 4 of Kay et al. 2015 (doi:10.1175/BAMS-D-13-00255.1).
# Gracefully destroy/close our cluster
client.close()
cluster.close()
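An alternative cleanup pattern is to manage the client as a context manager so that it is closed even if a cell raises. A sketch, assuming the same cluster object:
# Optional: context-manager form closes the client automatically (sketch)
#with Client(cluster) as client:
#    ...  # run computations here
#cluster.close()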