# WDCC CMIP6 data download generator on DKRZ's Lustre
This notebook builds the interactive panels `app1` and `app2`, which allow you to

- get an overview of all fully archived CMIP6 experiments in the WDCC,
- select a download wish list at file level with a GUI, and
- generate a download script for `jblob`.

We make heavy use of `pandas`, `panel`, and the Python interface to the WDCC.

This notebook works well on Levante with the `/work/bm1344/conda-evns/py_312` environment.

This link leads to a deployed version of both apps. It starts a web worker in the background which runs the code and finally reloads the web page.
```python
import panel as pn

pn.extension('tabulator', sizing_mode="stretch_width", template="fast", inline=True)

from panel.widgets import Tqdm
from bokeh.models.widgets.tables import HTMLTemplateFormatter
import pandas as pd
import json
import requests
import tempfile
import hvplot.pandas
from pywdcc import WDCCClient

client = WDCCClient()

# Safety limit for how many dataset acronyms one experiment may resolve to
ALLOWED_NO_OF_DATASETS = 700
# Trunk of all WDCC landing page URLs; append an entry acronym to get a full link
LANDING_PAGE_TRUNK_URL = "https://www.wdc-climate.de/ui/entry?acronym="
# CMIP6 Data Reference Syntax (DRS) template used to label path elements
drs = 'Project_id/Activity_id/Institution_id/Source_id/Experiment_id/Member_id/Table_id/Variable_id/Grid_label/Version_id/Filename'
# Columns used internally but hidden in the interactive tables
to_hide = ["Path", "Filename", "Record"]
```
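`LANDING_PAGE_TRUNK_URL` combined with an entry acronym yields a landing page link, e.g.:

```python
# Landing page URL for the example acronym used further below
print(LANDING_PAGE_TRUNK_URL + "C6_5189604")
# https://www.wdc-climate.de/ui/entry?acronym=C6_5189604
```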
## Fully archived CMIP6 experiments
`app1` fetches data from the WDCC API to generate an interactive table of all fully archived CMIP6 experiments. The table contains a column for each DRS element. An identifier column links to the WDCC landing page so that you can switch to the web GUI at any point.

We also add filters that both allow subsetting the table and show the unique values of each column.
### Create a table
```python
def map_drs(row, start: int = 4, end: int = 999) -> dict:
    """Zip the DRS template with the path of a row, returning one key per DRS element."""
    global drs
    drs_dict = {}
    for key, value in zip(
        drs.split('/')[start:end],
        row["Path"].split('/')[start:end]
    ):
        drs_dict[key] = value
    return drs_dict

# Render the Acronym column as a link to the WDCC landing page
link_formatter = HTMLTemplateFormatter(
    template=f'<a target="_blank" href="{LANDING_PAGE_TRUNK_URL}<%= value %>"><%= value %></a>'
)
```
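As a quick check, we can apply `map_drs` to a hand-built row (the path is the first entry of the table further below); `start=1` and `end=5` select the `Activity_id` through `Experiment_id` levels:

```python
# map_drs zips the DRS template with the path elements of a row
example_row = {"Path": "CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piAer"}
print(map_drs(example_row, start=1, end=5))
# {'Activity_id': 'AerChemMIP', 'Institution_id': 'BCC',
#  'Source_id': 'BCC-ESM1', 'Experiment_id': 'hist-piAer'}
```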
Now we fetch the data for the experiments. To avoid overloading the API, we do it in a loop over paginated requests. `model_dump` returns a `dict` object.
```python
total_docs = []
estimated_total_number_of_exps = 1500
# Page through the API in chunks of 100 records
for s in range(0, estimated_total_number_of_exps, 100):
    cmip6_solr_dict = await client.get_all_fully_archived_cmip6_exps(start=s)
    total_docs += cmip6_solr_dict.model_dump()["docs"]
```
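If you are curious which Solr fields each record carries, inspect one entry; most of the fields are excluded or renamed in the next cell:

```python
# Each doc is a plain dict after model_dump()
print(sorted(total_docs[0].keys()))
```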
Next, we parse the information into a pandas DataFrame:
```python
df = pd.DataFrame.from_records(
    total_docs,
    exclude=[
        'id',
        #'entry_acronym_s',
        'textSuggest',
        #'entry_name_s',
        'title_sort',
        'creation_date_dt', 'entry_type_s', 'entry_type_l',
        'publication_type_s', 'publication_type_l', 'progress_acronym_s',
        'summary_s', 'data_size_l', 'geo', 'date_range_rdt', 'general_key_ss',
        'institute_info_ss', 'pinstitute_info_ss', 'person_name_ss',
        'project_name_ss', 'project_acronym_ss', 'hierarchy_ss',
        'hierarchy_steps_ss', 'access_s', 'authors_s', 'is_downloadable',
        '_version_', 'score']
).rename(columns=dict(
    entry_acronym_s="Acronym",
    entry_name_s="Path"
))
# Entry names look like "WCRP CMIP6 <Activity> <Institution> ...": drop the
# "WCRP" prefix and join the remaining elements with "/"
# (str.removeprefix is safer than str.lstrip, which strips a character set)
df["Path"] = df["Path"].str.removeprefix("WCRP").str.strip().str.replace(' ', '/')
```
We can map the `Path` column onto the DRS to create one additional column per DRS element:
```python
applied_df = df.apply(lambda row: map_drs(row, start=1, end=5), axis='columns', result_type='expand')
c6df = pd.concat([df, applied_df], axis='columns')
c6df
```
|      | Acronym    | Path                                              | Activity_id | Institution_id | Source_id     | Experiment_id      |
|------|------------|---------------------------------------------------|-------------|----------------|---------------|--------------------|
| 0    | C6_4097176 | CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piAer          | AerChemMIP  | BCC            | BCC-ESM1      | hist-piAer         |
| 1    | C6_4097185 | CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piNTCF         | AerChemMIP  | BCC            | BCC-ESM1      | hist-piNTCF        |
| 2    | C6_4097181 | CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST             | AerChemMIP  | BCC            | BCC-ESM1      | histSST            |
| 3    | C6_4097190 | CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST-piCH4       | AerChemMIP  | BCC            | BCC-ESM1      | histSST-piCH4      |
| 4    | C6_4097177 | CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST-piNTCF      | AerChemMIP  | BCC            | BCC-ESM1      | histSST-piNTCF     |
| ...  | ...        | ...                                               | ...         | ...            | ...           | ...                |
| 1163 | C6_4138740 | CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp245      | ScenarioMIP | MPI-M          | MPI-ESM1-2-LR | ssp245             |
| 1164 | C6_4138737 | CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp370      | ScenarioMIP | MPI-M          | MPI-ESM1-2-LR | ssp370             |
| 1165 | C6_4138738 | CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585      | ScenarioMIP | MPI-M          | MPI-ESM1-2-LR | ssp585             |
| 1166 | C6_5168193 | CMIP6/VolMIP/MPI-M/MPI-ESM1-2-LR/volc-long-eq     | VolMIP      | MPI-M          | MPI-ESM1-2-LR | volc-long-eq       |
| 1167 | C6_5168194 | CMIP6/VolMIP/MPI-M/MPI-ESM1-2-LR/volc-pinatubo... | VolMIP      | MPI-M          | MPI-ESM1-2-LR | volc-pinatubo-full |

1168 rows × 6 columns
### Make the table interactive
At this stage, we could continue to work with `c6df` programmatically. To make it even more convenient, we wrap the table in a `pn.widgets.Tabulator` to make it interactive. For the final `app1`, we add filters on the left that show the unique values of each column.
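If you prefer the programmatic route, a plain pandas selection (with values taken from the table above) achieves the same as the GUI filters:

```python
# Programmatic alternative to the GUI filters: all fully archived
# ScenarioMIP experiments of MPI-ESM1-2-LR
subset = c6df[
    (c6df["Activity_id"] == "ScenarioMIP")
    & (c6df["Source_id"] == "MPI-ESM1-2-LR")
]
# contains at least ssp245, ssp370 and ssp585 (see the table rows above)
print(subset["Experiment_id"].tolist())
```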
```python
def get_filter_columns(df) -> dict:
    """Build one MultiChoice widget per visible column, offering its unique values."""
    column_filter_dict = {}
    for col in list(df.columns):
        # Skip hidden and technical columns
        if col in to_hide or "Size" in col or "blob" in col.lower():
            continue
        select = pn.widgets.MultiChoice(
            options=sorted([a for a in df[col].unique() if a]),
            name=f'{col}',
            search_option_limit=100
        )
        column_filter_dict[col] = select
    return column_filter_dict
```
```python
c6tabu = pn.widgets.Tabulator(
    c6df,
    show_index=False,
    formatters={"Acronym": link_formatter},
    hidden_columns=to_hide,
    header_filters=True,
    selectable=1,
    page_size=20,
    disabled=True
)
# Connect each MultiChoice widget as a filter on its column
c6filters = get_filter_columns(c6df)
for col, cf in c6filters.items():
    c6tabu.add_filter(cf, col)
```
```python
app1 = pn.Column(
    pn.Row(
        pn.Column(
            *list(c6filters.values())
        ),
        pn.Column(
            pn.pane.HTML('<h3>Fully archived CMIP6 experiments</h3>'),
            c6tabu
        ),
        name="prerendered"
    )
)
app1.servable()
```
## File subsetting and download script generation for a specific dataset
For our `app2`, we use two widgets in a row: a `TextInput` for a WDCC dataset landing page and a trigger button that creates a row containing the interactive table for that input.
```python
app2 = pn.Row()

dataset_landing_page = pn.widgets.TextInput(
    name='WDCC Dataset Landing Page',
    placeholder='https://www.wdc-climate.de/ui/entry?acronym=C6_5189604',
    description="Specify a URL to a WDCC Dataset Landing Page"
)
op = pn.widgets.Button(
    name="Create an interactive table",
    description="This will fetch the WDCC Dataset and children"
)
```
Each created table should be removable. For that, we define the `remove_row` function and bind it to a button that comes with the interactive table.
```python
def remove_row(event, name):
    """Remove all elements of app2 whose name contains the given acronym."""
    global app2
    if not event:
        return ""
    idxl = []
    for i, elem in enumerate(app2):
        if name in elem.name:
            idxl.append(i)
    # Pop from the back so that earlier indices stay valid
    for idx in reversed(idxl):
        app2.pop(idx)
    return ""
```
For a selection in the interactive table, you can click another button that generates a download script which will work on Levante.
```python
def create_jblob_script(event, name):
    """Write a jblob download script for the current view of the table named
    `name` and return a FileDownload widget for it."""
    global app2
    if not event:
        return ""
    ltabu = None
    dow = None
    # Find the Tabulator that belongs to this acronym
    for i, elem in enumerate(app2):
        if name in elem.name:
            ltabu = elem[0][-1][-1]
    if ltabu:
        # current_view respects all active filters
        recent_df = ltabu.current_view
        tmp_file = tempfile.NamedTemporaryFile(suffix=".sh", delete=False)
        fname = f"jblob-script-{name}.sh"
        with open(tmp_file.name, 'w') as f:
            f.write('module load jblob\n')
            for local_acro in recent_df["Acronym"].unique():
                subtabu = recent_df[recent_df["Acronym"] == local_acro]
                for rec in subtabu["Record"].unique():
                    subsubtabu = subtabu[subtabu["Record"] == rec]
                    # Recreate the DRS directory tree, then fetch the single record
                    f.write(f"mkdir -p {'/'.join(subsubtabu['Path'].values[0].split('/')[:-1])}\n")
                    f.write(f"jblob --dataset {local_acro} --rmin {rec} --rmax {rec} --file {subsubtabu['Path'].values[0]}\n")
        dow = pn.widgets.FileDownload(
            file=tmp_file.name,
            filename=fname
        )
    return dow
```
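The generated script has the following shape. `module load jblob` makes the jblob client available on Levante; each file is then fetched via its dataset acronym and record number. The placeholders in angle brackets stand for the concrete values that the function writes:

```
module load jblob
mkdir -p <directory part of the file path>
jblob --dataset <Acronym> --rmin <Record> --rmax <Record> --file <Path>
...
```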
```python
def refine_df(df):
    """Rename download-form columns and derive convenience columns."""
    df = df.rename(
        columns=dict(
            FILE_NAME="Path",
            FILE_SIZE="Size [MB]",
            CACHED_CONTAINER="Cached",
            BLOB_ID="Record"
        )
    )
    df["Record"] = df["Record"].astype(int)
    df["Filename"] = df["Path"].str.split('/').str[-1]
    # File sizes come in bytes
    df["Size [MB]"] = (df["Size [MB]"] / 1024**2).astype(int)
    df.loc[~df["Cached"].isnull(), "Cached"] = "Yes"
    df.loc[df["Cached"].isnull(), "Cached"] = "No"
    # The last underscore-separated token of a CMIP6 filename is the time range,
    # except for fixed fields, where it is the grid label (starting with 'g')
    df["Time slice"] = df["Filename"].str.split('_').str[-1].str.strip('.nc')
    df.loc[df["Time slice"].str[0] == 'g', "Time slice"] = "Fix"
    df = df.drop(columns="Filename")
    # Expand the full file path into one column per DRS element
    applied_df = df.apply(lambda row: map_drs(row), axis='columns', result_type='expand')
    df = pd.concat([df, applied_df], axis='columns')
    df["Grid_label"] = df["Grid_label"].astype(str)
    return df
```
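The time-slice logic relies on the CMIP6 filename convention, in which the last underscore-separated token is the time range, while time-invariant ("fixed") fields end with the grid label instead. A minimal illustration with two hypothetical filenames:

```python
# Hypothetical filenames following the standard CMIP6 naming scheme
f_time = "tas_Amon_MPI-ESM1-2-LR_ssp245_r1i1p1f1_gn_201501-203412.nc"
f_fix = "areacella_fx_MPI-ESM1-2-LR_ssp245_r1i1p1f1_gn.nc"

print(f_time.split('_')[-1].strip('.nc'))  # 201501-203412
print(f_fix.split('_')[-1].strip('.nc'))   # g
# str.strip('.nc') removes the characters {'.', 'n', 'c'} from both ends,
# so the grid label 'gn' collapses to 'g' -- which is why refine_df flags
# tokens starting with 'g' as "Fix".
```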
If you do not want to work with the app but rather only with the DataFrame, you can just run the `create_df` function for a specific experiment acronym `main_acronym`.
```python
async def create_df(client, main_acronym):
    """Collect the download forms of all datasets below main_acronym into one DataFrame."""
    global ALLOWED_NO_OF_DATASETS, app2
    all_acronyms = await client.get_dataset_acronyms(main_acronym)
    no_all_acronyms = len(all_acronyms)
    if no_all_acronyms > ALLOWED_NO_OF_DATASETS:
        raise ValueError(
            f"More than {ALLOWED_NO_OF_DATASETS} acronyms are not allowed. "
            f"Found {no_all_acronyms} for acronym {main_acronym}"
        )
    df = pd.DataFrame()
    print("Start for loop...")
    # Show a progress bar inside the app while fetching the download forms
    tqdm = Tqdm()
    app2.append(tqdm)
    for acronym in tqdm(all_acronyms):
        download_form = await client.get_download_form(acronym)
        newdf = pd.DataFrame.from_records(
            download_form["downloadInfo"]["metaTable"],
            exclude=[
                'ENTRY_ID',
                #'BLOB_ID',
                'CHECKSUM',
                'CHECKSUM_TYPE',
                'EXTERNAL_IDENTIFIER',
                'START_DATE',
                'UPLOAD_DATE',
                #'CACHED_CONTAINER'
            ]
        )
        newdf["Acronym"] = acronym
        df = pd.concat([df, newdf])
    df = refine_df(df)
    return df
```
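A minimal standalone use, assuming the example acronym from the placeholder above refers to a CMIP6 dataset (note that `create_df` appends a progress bar to `app2` as a side effect):

```python
# Top-level await works in a notebook cell
df = await create_df(client, "C6_5189604")
df.head()
```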
```python
async def create_app2(event):
    """Resolve the landing page, fetch the dataset table, and append it to app2."""
    global to_hide, app2, link_formatter, ALLOWED_NO_OF_DATASETS, client
    landing_page = dataset_landing_page.value
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # Follow redirects so that short links end up at the real landing page
    resp = requests.get(landing_page, allow_redirects=True, headers=headers)
    if resp.status_code >= 500:
        raise ValueError(f"Landing page {landing_page} returned a server error")
    landing_page = resp.url
    main_acronym = await client.get_acronym_from_landing_page(landing_page)
    if not main_acronym:
        raise ValueError(f"Failed to find acronym for landing page {landing_page}")
    if await client.check_project_for_acronym(main_acronym, "CMIP6"):
        df = await create_df(client, main_acronym)
        tabu = pn.widgets.Tabulator(
            df,
            show_index=False,
            formatters={"Acronym": link_formatter},
            hidden_columns=to_hide,
            header_filters=True,
            page_size=20,
            disabled=True
        )
        column_filter_dict = get_filter_columns(df)
        for col, select in column_filter_dict.items():
            tabu.add_filter(select, col)
        rem = pn.widgets.Button(
            name="Remove table"
        )
        jblob = pn.widgets.Button(
            name="Create jblob script"
        )
        avw = pn.pane.HTML(f'<h3>Volume of files of dataset: {df["Size [MB]"].sum()} MB</h3>')
        uvn = pn.pane.HTML(f'<h3>Unique variable names of dataset: {len(df["Variable_id"].unique())}</h3>')
        # Replace the progress bar appended by create_df with the finished table
        app2.pop(-1)
        app2.append(
            pn.Column(
                pn.Row(
                    pn.Column(
                        *list(column_filter_dict.values()),
                        rem,
                        pn.bind(
                            remove_row,
                            rem,
                            main_acronym
                        ),
                        jblob,
                        pn.bind(
                            create_jblob_script,
                            jblob,
                            main_acronym
                        )
                    ),
                    pn.Column(
                        pn.pane.HTML(f'<h3><a target="_blank" href="{landing_page}">{main_acronym}</a></h3>'),
                        tabu
                    )
                ),
                pn.Row(
                    avw,
                    uvn,
                ),
                name=main_acronym
            )
        )
    else:
        raise ValueError(f"Acronym {main_acronym} is not a CMIP6 dataset")
```
```python
# Register create_app2 as the click handler of the trigger button
op.on_click(create_app2)
```
```python
app2 = pn.Column(pn.Row(dataset_landing_page, op))
app2.servable()
```