import xarray as xr
import numpy as np
import intake
import ast
import matplotlib.pyplot as plt
Run the cell below if the notebook is running on an NCAR supercomputer.
If the notebook is running on a different parallel computing environment,
replace the usage of `NCARCluster` with a similar cluster object from
`dask_jobqueue` or `dask_gateway`.
# Launch a Dask cluster on the NCAR batch system and attach a client to it.
# NOTE(review): this submits real batch jobs — run only on a system with
# ncar_jobqueue configured.
import dask
from ncar_jobqueue import NCARCluster

# Number of batch jobs (workers) to request, and the per-job resource limits.
num_jobs = 20
walltime = "1:00:00"
memory='20GB'

cluster = NCARCluster(walltime=walltime, memory=memory)
# Scale the cluster out to the requested number of jobs (workers start asynchronously).
cluster.scale(jobs=num_jobs)

from distributed import Client
# Connecting a Client makes this cluster the default executor for Dask operations below.
client = Client(cluster)
# Displaying the cluster object shows the dashboard link in the notebook.
cluster
☝️ Link to Dask dashboard will appear above.
# Define the catalog description file location (an intake-esm JSON descriptor on S3).
catalog_url = "https://ncar-na-cordex.s3-us-west-2.amazonaws.com/catalogs/aws-na-cordex.json"

# Have the catalog interpret the "na-cordex-models" column as a list of values.
# ast.literal_eval safely parses the stringified Python lists stored in the CSV.
# NOTE(review): newer intake-esm versions renamed `csv_kwargs` to `read_csv_kwargs` —
# confirm against the installed intake-esm version.
col = intake.open_esm_datastore(catalog_url, csv_kwargs={"converters": {"na-cordex-models": ast.literal_eval}},)
# Display a summary of the catalog contents.
col
aws-na-cordex catalog with 330 dataset(s) from 330 asset(s):
unique | |
---|---|
variable | 15 |
standard_name | 10 |
long_name | 18 |
units | 10 |
spatial_domain | 1 |
grid | 2 |
spatial_resolution | 2 |
scenario | 6 |
start_time | 3 |
end_time | 4 |
frequency | 1 |
vertical_levels | 1 |
bias_correction | 3 |
na-cordex-models | 26 |
path | 330 |
# Show the first few lines of the catalog's underlying pandas DataFrame.
col.df.head(10)
variable | standard_name | long_name | units | spatial_domain | grid | spatial_resolution | scenario | start_time | end_time | frequency | vertical_levels | bias_correction | na-cordex-models | path | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | eval | 1979-01-01T12:00:00 | 2014-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.CRCM5-OUR, ERA-In... | s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i.... |
1 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | eval | 1979-01-01T12:00:00 | 2015-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.RegCM4, ERA-Int.H... | s3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.... |
2 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [CanESM2.CanRCM4] | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
3 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [CanESM2.CanRCM4] | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
4 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | raw | [GFDL-ESM2M.CRCM5-OUR, CanESM2.CRCM5-OUR, CanE... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
5 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
6 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
7 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-44i | 0.50 deg | hist-rcp45 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | raw | [MPI-ESM-LR.CRCM5-UQAM, CanESM2.CRCM5-UQAM, EC... | s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA... |
8 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp85 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-Daymet | [MPI-ESM-MR.CRCM5-UQAM, GEMatm-Can.CRCM5-UQAM,... | s3://ncar-na-cordex/day/hurs.hist-rcp85.day.NA... |
9 | hurs | relative_humidity | Near-Surface Relative Humidity | % | north_america | NAM-22i | 0.25 deg | hist-rcp85 | 1949-01-01T12:00:00 | 2100-12-31T12:00:00 | day | 1 | mbcn-gridMET | [MPI-ESM-MR.CRCM5-UQAM, GEMatm-Can.CRCM5-UQAM,... | s3://ncar-na-cordex/day/hurs.hist-rcp85.day.NA... |
# Produce a catalog content summary: the unique values found in selected columns.
import pprint

uniques = col.unique(
columns=["variable", "scenario", "grid", "na-cordex-models", "bias_correction"]
)
# compact/indent keep the (nested) summary dictionary readable.
pprint.pprint(uniques, compact=True, indent=4)
{ 'bias_correction': { 'count': 3, 'values': ['mbcn-Daymet', 'raw', 'mbcn-gridMET']}, 'grid': {'count': 2, 'values': ['NAM-44i', 'NAM-22i']}, 'na-cordex-models': { 'count': 26, 'values': [ 'MPI-ESM-LR.CRCM5-OUR', 'CanESM2.CRCM5-OUR', 'GFDL-ESM2M.WRF', 'HadGEM2-ES.RegCM4', 'CanESM2.CRCM5-UQAM', 'ERA-Int.RCA4', 'ERA-Int.CRCM5-OUR', 'EC-EARTH.HIRHAM5', 'MPI-ESM-MR.CRCM5-UQAM', 'ERA-Int.HIRHAM5', 'MPI-ESM-LR.WRF', 'HadGEM2-ES.WRF', 'GEMatm-MPI.CRCM5-UQAM', 'EC-EARTH.RCA4', 'GEMatm-Can.CRCM5-UQAM', 'CNRM-CM5.CRCM5-OUR', 'ERA-Int.WRF', 'CanESM2.RCA4', 'MPI-ESM-LR.CRCM5-UQAM', 'ERA-Int.CanRCM4', 'ERA-Int.CRCM5-UQAM', 'GFDL-ESM2M.RegCM4', 'CanESM2.CanRCM4', 'GFDL-ESM2M.CRCM5-OUR', 'ERA-Int.RegCM4', 'MPI-ESM-LR.RegCM4']}, 'scenario': { 'count': 6, 'values': [ 'rcp45', 'hist-rcp45', 'hist-rcp85', 'rcp85', 'hist', 'eval']}, 'variable': { 'count': 15, 'values': [ 'ps', 'tmin', 'tasmin', 'tmax', 'vas', 'hurs', 'tasmax', 'rsds', 'pr', 'prec', 'uas', 'tas', 'sfcWind', 'huss', 'temp']}}
# Variable of interest: daily maximum near-surface air temperature.
data_var = 'tmax'

# Narrow the catalog to a single dataset: tmax on the 0.50-degree grid,
# ERA-Interim evaluation scenario, with no bias correction applied.
col_subset = col.search(
    variable=data_var,
    grid="NAM-44i",
    scenario="eval",
    bias_correction="raw",
)
# Display a summary of the subsetted catalog.
col_subset
aws-na-cordex catalog with 1 dataset(s) from 1 asset(s):
unique | |
---|---|
variable | 1 |
standard_name | 1 |
long_name | 1 |
units | 1 |
spatial_domain | 1 |
grid | 1 |
spatial_resolution | 1 |
scenario | 1 |
start_time | 1 |
end_time | 1 |
frequency | 1 |
vertical_levels | 1 |
bias_correction | 1 |
na-cordex-models | 6 |
path | 1 |
col_subset.df
variable | standard_name | long_name | units | spatial_domain | grid | spatial_resolution | scenario | start_time | end_time | frequency | vertical_levels | bias_correction | na-cordex-models | path | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | tmax | air_temperature | Daily Maximum Near-Surface Air Temperature | degC | north_america | NAM-44i | 0.50 deg | eval | 1979-01-01T12:00:00 | 2015-12-31T12:00:00 | day | 1 | raw | [ERA-Int.CRCM5-UQAM, ERA-Int.RegCM4, ERA-Int.H... | s3://ncar-na-cordex/day/tmax.eval.day.NAM-44i.... |
# Load catalog entries for subset into a dictionary of xarray datasets, and open the first one.
# The Zarr stores live in a public S3 bucket, hence anonymous access.
dsets = col_subset.to_dataset_dict(
zarr_kwargs={"consolidated": True}, storage_options={"anon": True}
)
print(f"\nDataset dictionary keys:\n {dsets.keys()}")

# Load the first dataset and display a summary.
dataset_key = list(dsets.keys())[0]
# Reconstruct the Zarr store name from the key, for use in plot titles below.
store_name = dataset_key + ".zarr"

ds = dsets[dataset_key]
ds
# Note that the summary includes a 'member_id' coordinate, which is a renaming of the
# 'na-cordex-models' column in the catalog.
--> The keys in the returned dictionary of datasets are constructed as follows: 'variable.frequency.scenario.grid.bias_correction'
Dataset dictionary keys: dict_keys(['tmax.day.eval.NAM-44i.raw'])
<xarray.Dataset> Dimensions: (bnds: 2, lat: 129, lon: 300, member_id: 6, time: 13514) Coordinates: * lat (lat) float64 12.25 12.75 13.25 13.75 ... 74.75 75.25 75.75 76.25 * lon (lon) float64 -171.8 -171.2 -170.8 ... -23.25 -22.75 -22.25 * member_id (member_id) <U18 'ERA-Int.CRCM5-UQAM' ... 'ERA-Int.WRF' * time (time) datetime64[ns] 1979-01-01T12:00:00 ... 2015-12-31T12:00:00 time_bnds (time, bnds) datetime64[ns] dask.array<chunksize=(13514, 2), meta=np.ndarray> Dimensions without coordinates: bnds Data variables: tmax (member_id, time, lat, lon) float32 dask.array<chunksize=(4, 1000, 65, 150), meta=np.ndarray> Attributes: (12/26) CORDEX_domain: NAM-44 contact: {"ERA-Int.CRCM5-UQAM": "Winger.Katja@uqam... contact_note: {"ERA-Int.RegCM4": "Simulations by Ray Ar... creation_date: {"ERA-Int.CRCM5-UQAM": "2015-06-18T10:56:... driving_experiment: {"ERA-Int.CRCM5-UQAM": "ECMWF-ERAINT, eva... driving_experiment_name: evaluation ... ... tracking_id: {"ERA-Int.CRCM5-UQAM": "a8d17321-0e16-46e... version: {"ERA-Int.CRCM5-UQAM": "1.2", "ERA-Int.Re... zarr-dataset-reference: For dataset documentation, see DOI https:... zarr-version: 1.0 intake_esm_varname: ['tmax'] intake_esm_dataset_key: tmax.day.eval.NAM-44i.raw
array([12.25, 12.75, 13.25, 13.75, 14.25, 14.75, 15.25, 15.75, 16.25, 16.75, 17.25, 17.75, 18.25, 18.75, 19.25, 19.75, 20.25, 20.75, 21.25, 21.75, 22.25, 22.75, 23.25, 23.75, 24.25, 24.75, 25.25, 25.75, 26.25, 26.75, 27.25, 27.75, 28.25, 28.75, 29.25, 29.75, 30.25, 30.75, 31.25, 31.75, 32.25, 32.75, 33.25, 33.75, 34.25, 34.75, 35.25, 35.75, 36.25, 36.75, 37.25, 37.75, 38.25, 38.75, 39.25, 39.75, 40.25, 40.75, 41.25, 41.75, 42.25, 42.75, 43.25, 43.75, 44.25, 44.75, 45.25, 45.75, 46.25, 46.75, 47.25, 47.75, 48.25, 48.75, 49.25, 49.75, 50.25, 50.75, 51.25, 51.75, 52.25, 52.75, 53.25, 53.75, 54.25, 54.75, 55.25, 55.75, 56.25, 56.75, 57.25, 57.75, 58.25, 58.75, 59.25, 59.75, 60.25, 60.75, 61.25, 61.75, 62.25, 62.75, 63.25, 63.75, 64.25, 64.75, 65.25, 65.75, 66.25, 66.75, 67.25, 67.75, 68.25, 68.75, 69.25, 69.75, 70.25, 70.75, 71.25, 71.75, 72.25, 72.75, 73.25, 73.75, 74.25, 74.75, 75.25, 75.75, 76.25])
array([-171.75, -171.25, -170.75, ..., -23.25, -22.75, -22.25])
array(['ERA-Int.CRCM5-UQAM', 'ERA-Int.RegCM4', 'ERA-Int.HIRHAM5', 'ERA-Int.RCA4', 'ERA-Int.CanRCM4', 'ERA-Int.WRF'], dtype='<U18')
array(['1979-01-01T12:00:00.000000000', '1979-01-02T12:00:00.000000000', '1979-01-03T12:00:00.000000000', ..., '2015-12-29T12:00:00.000000000', '2015-12-30T12:00:00.000000000', '2015-12-31T12:00:00.000000000'], dtype='datetime64[ns]')
|
|
def plotMap(ax, map_slice, date_object=None, member_id=None):
    """Create a map plot on the given axes, with min/max as text.

    Parameters
    ----------
    ax : matplotlib Axes to draw into.
    map_slice : 2-D xarray.DataArray with 'lat' and 'lon' dimensions.
    date_object : optional scalar time coordinate; if given, used as the title.
    member_id : optional string; if given, used as the y-axis label.

    Returns
    -------
    The same Axes object, for optional chaining.
    """
    ax.imshow(map_slice, origin='lower')

    # Reduce over the spatial dimensions to scalar min/max for annotation.
    minval = map_slice.min(dim = ['lat', 'lon'])
    maxval = map_slice.max(dim = ['lat', 'lon'])

    # Format values with 4 significant digits. The original "%3g" set a minimum
    # field *width* of 3, not the precision the comment promised; "%.4g" matches
    # the stated intent. float() avoids %-formatting a 0-d DataArray directly.
    ax.text(0.01, 0.03, "Min: %.4g" % float(minval), transform=ax.transAxes, fontsize=12)
    ax.text(0.99, 0.03, "Max: %.4g" % float(maxval), transform=ax.transAxes, fontsize=12, horizontalalignment='right')

    # Hide tick marks: pixel indices are not meaningful coordinates here.
    ax.set_xticks([])
    ax.set_yticks([])
    if date_object:
        # Title with just the date portion (first 10 chars of the ISO timestamp).
        ax.set_title(date_object.values.astype(str)[:10], fontsize=12)
    if member_id:
        ax.set_ylabel(member_id, fontsize=12)
    return ax
def getValidDateIndexes(member_slice):
    """Search for the first and last dates with finite values.

    Returns a (start_index, end_index) pair of positions along the time
    dimension whose spatial minimum over ('lat', 'lon') is finite.
    """
    spatial_min = member_slice.min(dim=['lat', 'lon'])
    # Positions of all time steps with a finite spatial minimum, in order.
    valid_positions = np.flatnonzero(np.isfinite(spatial_min))
    return valid_positions[0], valid_positions[-1]
def plot_first_mid_last(ds, data_var, store_name):
    """Plot the first, middle, and final time steps for several climate runs."""
    num_members_to_plot = 4
    num_plot_columns = 3
    members = ds.coords['member_id'].values[0:num_members_to_plot]

    fig, axs = plt.subplots(num_members_to_plot, num_plot_columns,
                            figsize=(18, 12), constrained_layout=True)

    # The midpoint of the time axis is the same for every member; compute once.
    mid_index = np.floor(len(ds.time) / 2).astype(int)
    mid_date = ds.time[mid_index]

    for row, mem_id in enumerate(members):
        run = ds[data_var].sel(member_id=mem_id)
        # First/last time steps that actually contain data for this member.
        first_index, last_index = getValidDateIndexes(run)
        first_date = ds.time[first_index]
        last_date = ds.time[last_index]

        # Column 0 carries the member label; the others show only the date.
        plotMap(axs[row, 0], run.sel(time=first_date), first_date, mem_id)
        plotMap(axs[row, 1], run.sel(time=mid_date), mid_date)
        plotMap(axs[row, 2], run.sel(time=last_date), last_date)

    plt.suptitle(f'First, Middle, and Last Timesteps for Selected Runs in "{store_name}"', fontsize=20)
    return fig
def plot_stat_maps(ds, data_var, store_name):
    """Plot the mean, min, max, and standard deviation values for several climate runs, aggregated over time."""
    num_members_to_plot = 4
    num_plot_columns = 4
    members = ds.coords['member_id'].values[0:num_members_to_plot]

    fig, axs = plt.subplots(num_members_to_plot, num_plot_columns,
                            figsize=(25, 12), constrained_layout=True)

    for row, mem_id in enumerate(members):
        run = ds[data_var].sel(member_id=mem_id)
        # One statistic per column, each aggregated over the time dimension.
        # Only the first column carries the member label.
        plotMap(axs[row, 0], run.min(dim='time'), member_id=mem_id)
        plotMap(axs[row, 1], run.max(dim='time'))
        plotMap(axs[row, 2], run.mean(dim='time'))
        plotMap(axs[row, 3], run.std(dim='time'))

    # Column headers go on the top row only.
    for col_num, stat_name in enumerate(['min', 'max', 'mean', 'std']):
        axs[0, col_num].set_title(f'{stat_name}({data_var})', fontsize=15)

    plt.suptitle(f'Spatial Statistics for Selected Runs in "{store_name}"', fontsize=20)
    return fig
The following plot also shows which dates have no available data values, as a rug plot along the bottom of each panel.
def plot_timeseries(ds, data_var, store_name):
    """Plot the mean, min, max, and standard deviation values for several climate runs,
    aggregated over lat/lon dimensions."""
    # Limit the figure to the first few ensemble members to keep it readable.
    num_members_to_plot = 4
    member_names = ds.coords['member_id'].values[0:num_members_to_plot]
    figWidth = 25
    figHeight = 20
    linewidth = 0.5
    numPlotColumns = 1
    fig, axs = plt.subplots(num_members_to_plot, numPlotColumns, figsize=(figWidth, figHeight))
    for index in np.arange(num_members_to_plot):
        mem_id = member_names[index]
        data_slice = ds[data_var].sel(member_id=mem_id)
        unit_string = ds[data_var].attrs['units']
        # Spatially aggregated statistics: one value per time step.
        min_vals = data_slice.min(dim = ['lat', 'lon'])
        max_vals = data_slice.max(dim = ['lat', 'lon'])
        mean_vals = data_slice.mean(dim = ['lat', 'lon'])
        std_vals = data_slice.std(dim = ['lat', 'lon'])
        # Time steps whose spatial minimum is NaN have no data for this member;
        # collect their dates for the rug plot below.
        missing_indexes = np.isnan(min_vals)
        missing_times = ds.time[missing_indexes]
        axs[index].plot(ds.time, max_vals, linewidth=linewidth, label='max', color='red')
        axs[index].plot(ds.time, mean_vals, linewidth=linewidth, label='mean', color='black')
        # Shade one standard deviation around the mean.
        axs[index].fill_between(ds.time, (mean_vals - std_vals), (mean_vals + std_vals),
                                color='grey', linewidth=0, label='std', alpha=0.5)
        axs[index].plot(ds.time, min_vals, linewidth=linewidth, label='min', color='blue')
        # Place rug marks for missing dates just above the bottom of the axes.
        # NOTE: get_ylim() must be called after the series are plotted, so the
        # limits reflect the data range.
        ymin, ymax = axs[index].get_ylim()
        rug_y = ymin + 0.01*(ymax-ymin)
        axs[index].plot(missing_times, [rug_y]*len(missing_times), '|', color='m', label='missing')
        axs[index].set_title(mem_id, fontsize=20)
        axs[index].legend(loc='upper right')
        axs[index].set_ylabel(unit_string)
    # Pad the layout so the suptitle does not overlap the top panel.
    plt.tight_layout(pad=10.2, w_pad=3.5, h_pad=3.5)
    plt.suptitle(f'Temporal Statistics for Selected Runs in "{store_name}"', fontsize=20)
    return fig
%%time
# Plot first/middle/last maps using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_first_mid_last(ds, data_var, store_name)
plt.show()
CPU times: user 7.07 s, sys: 470 ms, total: 7.54 s Wall time: 1min 2s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_FML.png'
    figure.savefig(plotfile, dpi=100)
%%time
# Plot spatial statistics maps using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_stat_maps(ds, data_var, store_name)
plt.show()
CPU times: user 30.8 s, sys: 1.81 s, total: 32.6 s Wall time: 4min 20s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_MAPS.png'
    figure.savefig(plotfile, dpi=100)
%%time
# Plot temporal statistics using the Zarr Store obtained from an earlier step in the notebook.
figure = plot_timeseries(ds, data_var, store_name)
plt.show()
CPU times: user 42.7 s, sys: 4.06 s, total: 46.8 s Wall time: 5min 49s
Change the value of SAVE_PLOT to True to produce a PNG file of the plot. The file will be saved in the same folder as this notebook.
Then use Jupyter's file browser to locate the file and right-click the file to download it.
# Set SAVE_PLOT to True to write the figure as a PNG next to this notebook.
SAVE_PLOT = False
if SAVE_PLOT:
    plotfile = f'./{dataset_key}_TS.png'
    figure.savefig(plotfile, dpi=100)
!date
cluster.close()