Lecture 13: Cloud computing¶

Reference: https://github.com/pangeo-gallery/cmip6

Install python packages: zarr, fsspec, gcsfs, nc-time-axis

In [ ]:

Copied!





from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import fsspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import zarr
import fsspec

Browse Catalog¶

The data catatalog is stored as a CSV file. Here we read it with Pandas.

In [3]:

Copied!

df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
df.head()
df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv')
df.head()

Out[3]:

	activity_id	institution_id	source_id	experiment_id	member_id	table_id	variable_id	grid_label	zstore	dcpp_init_year	version
0	HighResMIP	CMCC	CMCC-CM2-HR4	highresSST-present	r1i1p1f1	Amon	ps	gn	gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...	NaN	20170706
1	HighResMIP	CMCC	CMCC-CM2-HR4	highresSST-present	r1i1p1f1	Amon	rsds	gn	gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...	NaN	20170706
2	HighResMIP	CMCC	CMCC-CM2-HR4	highresSST-present	r1i1p1f1	Amon	rlus	gn	gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...	NaN	20170706
3	HighResMIP	CMCC	CMCC-CM2-HR4	highresSST-present	r1i1p1f1	Amon	rlds	gn	gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...	NaN	20170706
4	HighResMIP	CMCC	CMCC-CM2-HR4	highresSST-present	r1i1p1f1	Amon	psl	gn	gs://cmip6/CMIP6/HighResMIP/CMCC/CMCC-CM2-HR4/...	NaN	20170706

In [4]:

Copied!

df['source_id'].unique()
df['source_id'].unique()

Out[4]:

array(['CMCC-CM2-HR4', 'EC-Earth3P-HR', 'HadGEM3-GC31-MM',
       'HadGEM3-GC31-HM', 'HadGEM3-GC31-LM', 'EC-Earth3P', 'ECMWF-IFS-HR',
       'ECMWF-IFS-LR', 'HadGEM3-GC31-LL', 'CMCC-CM2-VHR4', 'GFDL-CM4',
       'GFDL-AM4', 'IPSL-CM6A-LR', 'E3SM-1-0', 'CNRM-CM6-1', 'GFDL-ESM4',
       'GFDL-ESM2M', 'GFDL-CM4C192', 'GFDL-OM4p5B', 'GISS-E2-1-G',
       'GISS-E2-1-H', 'CNRM-ESM2-1', 'BCC-CSM2-MR', 'BCC-ESM1', 'MIROC6',
       'AWI-CM-1-1-MR', 'EC-Earth3-LR', 'IPSL-CM6A-ATM-HR', 'CESM2',
       'CESM2-WACCM', 'CNRM-CM6-1-HR', 'MRI-ESM2-0', 'CanESM5',
       'SAM0-UNICON', 'GISS-E2-1-G-CC', 'UKESM1-0-LL', 'EC-Earth3',
       'EC-Earth3-Veg', 'FGOALS-f3-L', 'CanESM5-CanOE', 'INM-CM4-8',
       'INM-CM5-0', 'NESM3', 'MPI-ESM-1-2-HAM', 'CAMS-CSM1-0',
       'MPI-ESM1-2-LR', 'MPI-ESM1-2-HR', 'MRI-AGCM3-2-H', 'MRI-AGCM3-2-S',
       'MCM-UA-1-0', 'INM-CM5-H', 'KACE-1-0-G', 'NorESM2-LM',
       'FGOALS-f3-H', 'FGOALS-g3', 'MIROC-ES2L', 'FIO-ESM-2-0', 'NorCPM1',
       'NorESM1-F', 'MPI-ESM1-2-XR', 'CESM1-1-CAM5-CMIP5', 'E3SM-1-1',
       'KIOST-ESM', 'ACCESS-CM2', 'NorESM2-MM', 'ACCESS-ESM1-5',
       'IITM-ESM', 'GISS-E2-2-G', 'CESM2-FV2', 'GISS-E2-2-H',
       'CESM2-WACCM-FV2', 'CIESM', 'E3SM-1-1-ECA', 'TaiESM1',
       'AWI-ESM-1-1-LR', 'EC-Earth3-Veg-LR', 'CMCC-ESM2', 'CAS-ESM2-0',
       'CMCC-CM2-SR5', 'EC-Earth3-AerChem', 'IPSL-CM6A-LR-INCA',
       'IPSL-CM5A2-INCA', 'BCC-CSM2-HR', 'EC-Earth3P-VHR',
       'CESM1-WACCM-SC', 'EC-Earth3-CC', 'MIROC-ES2H', 'ICON-ESM-LR'],
      dtype=object)

In [5]:

Copied!

df['experiment_id'].unique()
df['experiment_id'].unique()

Out[5]:

array(['highresSST-present', 'piControl', 'control-1950', 'hist-1950',
       'historical', 'amip', 'abrupt-4xCO2', 'abrupt-2xCO2',
       'abrupt-0p5xCO2', '1pctCO2', 'ssp585', 'esm-piControl', 'esm-hist',
       'hist-piAer', 'histSST-1950HC', 'ssp245', 'hist-1950HC', 'histSST',
       'piClim-2xVOC', 'piClim-2xNOx', 'piClim-2xdust', 'piClim-2xss',
       'piClim-histall', 'hist-piNTCF', 'histSST-piNTCF',
       'aqua-control-lwoff', 'piClim-lu', 'histSST-piO3', 'piClim-CH4',
       'piClim-NTCF', 'piClim-NOx', 'piClim-O3', 'piClim-HC',
       'faf-heat-NA0pct', 'ssp370SST-lowCH4', 'piClim-VOC',
       'ssp370-lowNTCF', 'piClim-control', 'piClim-aer', 'hist-aer',
       'faf-heat', 'faf-heat-NA50pct', 'ssp370SST-lowNTCF',
       'ssp370SST-ssp126Lu', 'ssp370SST', 'ssp370pdSST', 'histSST-piAer',
       'piClim-ghg', 'piClim-anthro', 'faf-all', 'hist-nat', 'hist-GHG',
       'ssp119', 'piClim-histnat', 'piClim-4xCO2', 'ssp370',
       'piClim-histghg', 'highresSST-future', 'esm-ssp585-ssp126Lu',
       'ssp126-ssp370Lu', 'ssp370-ssp126Lu', 'land-noLu', 'histSST-piCH4',
       'ssp126', 'esm-pi-CO2pulse', 'amip-hist', 'piClim-histaer',
       'amip-4xCO2', 'faf-water', 'faf-passiveheat', '1pctCO2-rad',
       'faf-stress', '1pctCO2-bgc', 'aqua-control', 'amip-future4K',
       'amip-p4K', 'aqua-p4K', 'amip-lwoff', 'amip-m4K', 'aqua-4xCO2',
       'amip-p4K-lwoff', 'hist-noLu', '1pctCO2-cdr',
       'land-hist-altStartYear', 'land-hist', 'omip1', 'esm-pi-cdr-pulse',
       'esm-ssp585', 'abrupt-solp4p', 'piControl-spinup', 'hist-stratO3',
       'abrupt-solm4p', 'midHolocene', 'lig127k', 'aqua-p4K-lwoff',
       'esm-piControl-spinup', 'ssp245-GHG', 'ssp245-nat',
       'dcppC-amv-neg', 'dcppC-amv-ExTrop-neg', 'dcppC-atl-control',
       'dcppC-amv-pos', 'dcppC-ipv-NexTrop-neg', 'dcppC-ipv-NexTrop-pos',
       'dcppC-atl-pacemaker', 'dcppC-amv-ExTrop-pos',
       'dcppC-amv-Trop-neg', 'dcppC-pac-control', 'dcppC-ipv-pos',
       'dcppC-pac-pacemaker', 'dcppC-ipv-neg', 'dcppC-amv-Trop-pos',
       'piClim-BC', 'piClim-2xfire', 'piClim-SO2', 'piClim-OC',
       'piClim-N2O', 'piClim-2xDMS', 'ssp460', 'ssp434', 'ssp534-over',
       'deforest-globe', 'historical-cmip5', 'hist-bgc',
       'piControl-cmip5', 'rcp26-cmip5', 'rcp45-cmip5', 'rcp85-cmip5',
       'pdSST-piArcSIC', 'pdSST-piAntSIC', 'piSST-piSIC', 'piSST-pdSIC',
       'ssp245-stratO3', 'hist-sol', 'hist-CO2', 'hist-volc',
       'hist-totalO3', 'hist-nat-cmip5', 'hist-aer-cmip5',
       'hist-GHG-cmip5', 'pdSST-futAntSIC', 'futSST-pdSIC', 'pdSST-pdSIC',
       'ssp245-aer', 'pdSST-futArcSIC', 'dcppA-hindcast', 'dcppA-assim',
       'dcppC-hindcast-noPinatubo', 'dcppC-hindcast-noElChichon',
       'dcppC-hindcast-noAgung', 'hist-resIPO', 'ssp245-cov-modgreen',
       'ssp245-cov-fossil', 'ssp245-cov-strgreen', 'ssp245-covid', 'lgm',
       'ssp585-bgc', '1pctCO2to4x-withism', '1pctCO2-4xext', 'past1000',
       'pa-futArcSIC', 'pa-pdSIC', 'historical-ext', 'pdSST-futArcSICSIT',
       'pdSST-futOkhotskSIC', 'pdSST-futBKSeasSIC', 'pa-piArcSIC',
       'pa-piAntSIC', 'pa-futAntSIC', 'pdSST-pdSICSIT'], dtype=object)

In [6]:

Copied!

df['institution_id'].unique()
df['institution_id'].unique()

Out[6]:

array(['CMCC', 'EC-Earth-Consortium', 'MOHC', 'ECMWF', 'NOAA-GFDL',
       'IPSL', 'E3SM-Project', 'CNRM-CERFACS', 'NASA-GISS', 'BCC',
       'MIROC', 'AWI', 'NCAR', 'MRI', 'CCCma', 'SNU', 'CAS', 'INM',
       'NUIST', 'HAMMOZ-Consortium', 'CAMS', 'MPI-M', 'DKRZ', 'DWD', 'UA',
       'NIMS-KMA', 'NCC', 'FIO-QLNM', 'KIOST', 'CSIRO-ARCCSS', 'CSIRO',
       'CCCR-IITM', 'THU', 'AS-RCEC', 'NERC', 'RUBISCO'], dtype=object)

The columns of the dataframe correspond to the CMI6 controlled vocabulary. A beginners' guide to these terms is available in this document.

Here we filter the data to find monthly surface air temperature for historical experiments.

In [7]:

Copied!

df_ta = df.query("activity_id=='CMIP' & table_id == 'Amon' & variable_id == 'tas' & experiment_id == 'historical'")
df_ta
df_ta = df.query("activity_id=='CMIP' & table_id == 'Amon' & variable_id == 'tas' & experiment_id == 'historical'")
df_ta

Out[7]:

	activity_id	institution_id	source_id	experiment_id	member_id	table_id	variable_id	grid_label	zstore	dcpp_init_year	version
973	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r3i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
1766	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r2i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
8074	CMIP	NOAA-GFDL	GFDL-CM4	historical	r1i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...	NaN	20180701
22185	CMIP	IPSL	IPSL-CM6A-LR	historical	r8i1p1f1	Amon	tas	gr	gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...	NaN	20180803
22298	CMIP	IPSL	IPSL-CM6A-LR	historical	r2i1p1f1	Amon	tas	gr	gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...	NaN	20180803
...	...	...	...	...	...	...	...	...	...	...	...
522952	CMIP	MRI	MRI-ESM2-0	historical	r7i1p1f1	Amon	tas	gn	gs://cmip6/CMIP6/CMIP/MRI/MRI-ESM2-0/historica...	NaN	20210813
523274	CMIP	MRI	MRI-ESM2-0	historical	r6i1p1f1	Amon	tas	gn	gs://cmip6/CMIP6/CMIP/MRI/MRI-ESM2-0/historica...	NaN	20210907
523712	CMIP	CMCC	CMCC-CM2-SR5	historical	r3i1p2f1	Amon	tas	gn	gs://cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-SR5/histor...	NaN	20211108
523721	CMIP	CMCC	CMCC-CM2-SR5	historical	r2i1p2f1	Amon	tas	gn	gs://cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-SR5/histor...	NaN	20211109
523769	CMIP	EC-Earth-Consortium	EC-Earth3-Veg	historical	r1i1p1f1	Amon	tas	gr	gs://cmip6/CMIP6/CMIP/EC-Earth-Consortium/EC-E...	NaN	20211207

635 rows × 11 columns

Now we do further filtering to find just the models from NOAA-GFDL.

In [17]:

Copied!

df_ta_gfdl = df_ta.query('institution_id == "NOAA-GFDL"')
df_ta_gfdl
df_ta_gfdl = df_ta.query('institution_id == "NOAA-GFDL"')
df_ta_gfdl

Out[17]:

	activity_id	institution_id	source_id	experiment_id	member_id	table_id	variable_id	grid_label	zstore	dcpp_init_year	version
973	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r3i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
1766	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r2i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
8074	CMIP	NOAA-GFDL	GFDL-CM4	historical	r1i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...	NaN	20180701
244695	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r1i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20190726

Load Data¶

Now we will load a single store using fsspec, zarr, and xarray.

In [18]:

Copied!

df_ta_gfdl
df_ta_gfdl

Out[18]:

	activity_id	institution_id	source_id	experiment_id	member_id	table_id	variable_id	grid_label	zstore	dcpp_init_year	version
973	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r3i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
1766	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r2i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20180701
8074	CMIP	NOAA-GFDL	GFDL-CM4	historical	r1i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/histo...	NaN	20180701
244695	CMIP	NOAA-GFDL	GFDL-ESM4	historical	r1i1p1f1	Amon	tas	gr1	gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/hist...	NaN	20190726

In [19]:

Copied!





# get the path to a specific zarr store (the first one from the dataframe above)
zstore = df_ta_gfdl.zstore.values[-1]
print(zstore)

# create an interface to the store
mapper = fsspec.get_mapper(zstore)

# open it using xarray and zarr
ds = xr.open_zarr(mapper, consolidated=True)
ds
# get the path to a specific zarr store (the first one from the dataframe above)
zstore = df_ta_gfdl.zstore.values[-1]
print(zstore)

# create an interface to the store
mapper = fsspec.get_mapper(zstore)

# open it using xarray and zarr
ds = xr.open_zarr(mapper, consolidated=True)
ds

gs://cmip6/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/tas/gr1/v20190726/

Out[19]:

<xarray.Dataset>
Dimensions:    (bnds: 2, lat: 180, lon: 288, time: 1980)
Coordinates:
  * bnds       (bnds) float64 1.0 2.0
    height     float64 ...
  * lat        (lat) float64 -89.5 -88.5 -87.5 -86.5 ... 86.5 87.5 88.5 89.5
    lat_bnds   (lat, bnds) float64 dask.array<chunksize=(180, 2), meta=np.ndarray>
  * lon        (lon) float64 0.625 1.875 3.125 4.375 ... 355.6 356.9 358.1 359.4
    lon_bnds   (lon, bnds) float64 dask.array<chunksize=(288, 2), meta=np.ndarray>
  * time       (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00
    time_bnds  (time, bnds) object dask.array<chunksize=(1980, 2), meta=np.ndarray>
Data variables:
    tas        (time, lat, lon) float32 dask.array<chunksize=(600, 180, 288), meta=np.ndarray>
Attributes: (12/49)
    Conventions:            CF-1.7 CMIP-6.0 UGRID-1.0
    activity_id:            CMIP
    branch_method:          standard
    branch_time_in_child:   0.0
    branch_time_in_parent:  36500.0
    comment:                <null ref>
    ...                     ...
    variable_id:            tas
    variant_info:           N/A
    variant_label:          r1i1p1f1
    status:                 2019-09-10;created;by nhn2@columbia.edu
    netcdf_tracking_ids:    hdl:21.14100/75e5c5a7-d7c4-4860-beb1-db454f25f13a...
    version_id:             v20190726

Plot a map from a specific date.

In [20]:

Copied!

ds.tas.sel(time='1950-01').plot()
ds.tas.sel(time='1950-01').plot()

Out[20]:

<matplotlib.collections.QuadMesh at 0x7fb548d59790>

No description has been provided for this image

Create a timeseries of global-average surface air temperature. For this we need the area weighting factor for each gridpoint.

In [22]:

Copied!

df_area = df.query("variable_id == 'areacella' & source_id == 'GFDL-ESM4'")
ds_area = xr.open_zarr(fsspec.get_mapper(df_area.zstore.values[0]), consolidated=True)
ds_area
df_area = df.query("variable_id == 'areacella' & source_id == 'GFDL-ESM4'")
ds_area = xr.open_zarr(fsspec.get_mapper(df_area.zstore.values[0]), consolidated=True)
ds_area

Out[22]:

<xarray.Dataset>
Dimensions:    (lat: 180, lon: 288, bnds: 2)
Coordinates:
  * bnds       (bnds) float64 1.0 2.0
  * lat        (lat) float64 -89.5 -88.5 -87.5 -86.5 ... 86.5 87.5 88.5 89.5
    lat_bnds   (lat, bnds) float64 dask.array<chunksize=(180, 2), meta=np.ndarray>
  * lon        (lon) float64 0.625 1.875 3.125 4.375 ... 355.6 356.9 358.1 359.4
    lon_bnds   (lon, bnds) float64 dask.array<chunksize=(288, 2), meta=np.ndarray>
Data variables:
    areacella  (lat, lon) float32 dask.array<chunksize=(180, 288), meta=np.ndarray>
Attributes: (12/48)
    Conventions:            CF-1.7 CMIP-6.0 UGRID-1.0
    activity_id:            CMIP
    branch_method:          standard
    branch_time_in_child:   0.0
    branch_time_in_parent:  36500.0
    comment:                <null ref>
    ...                     ...
    tracking_id:            hdl:21.14100/deeb5d3d-bbcc-464c-bb7d-ade2ac4d7ed0
    variable_id:            areacella
    variant_info:           N/A
    variant_label:          r1i1p1f1
    netcdf_tracking_ids:    hdl:21.14100/deeb5d3d-bbcc-464c-bb7d-ade2ac4d7ed0
    version_id:             v20180701

In [23]:

Copied!

total_area = ds_area.areacella.sum(dim=['lon', 'lat'])
ta_timeseries = (ds.tas * ds_area.areacella).sum(dim=['lon', 'lat']) / total_area
ta_timeseries
total_area = ds_area.areacella.sum(dim=['lon', 'lat'])
ta_timeseries = (ds.tas * ds_area.areacella).sum(dim=['lon', 'lat']) / total_area
ta_timeseries

Out[23]:

<xarray.DataArray (time: 1980)>
dask.array<truediv, shape=(1980,), dtype=float32, chunksize=(600,), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00
    height   float64 ...

By default the data are loaded lazily, as Dask arrays. Here we trigger computation explicitly.

In [24]:

Copied!

%time ta_timeseries.load()
%time ta_timeseries.load()

CPU times: user 6.3 s, sys: 1.56 s, total: 7.87 s
Wall time: 40.4 s

Out[24]:

<xarray.DataArray (time: 1980)>
array([224.00783, 224.21144, 224.76314, ..., 226.22311, 225.26149,
       224.80829], dtype=float32)
Coordinates:
  * time     (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00
    height   float64 2.0

In [25]:

Copied!





ta_timeseries.plot(label='monthly')
ta_timeseries.rolling(time=12).mean().plot(label='12 month rolling mean')
plt.legend()
plt.title('Global Mean Surface Air Temperature')
ta_timeseries.plot(label='monthly')
ta_timeseries.rolling(time=12).mean().plot(label='12 month rolling mean')
plt.legend()
plt.title('Global Mean Surface Air Temperature')

Out[25]:

Text(0.5, 1.0, 'Global Mean Surface Air Temperature')

In [ ]: