WDCC CMIP6 data download generator on DKRZ’s Lustre#

This notebook builds the interactive Panel apps app1 and app2 that allow you to

  1. get an overview of all fully archived CMIP6 experiments in WDCC

  2. select a download wish list at file level with a GUI

  3. generate a download script with jblob

We make heavy use of pandas, panel and pywdcc, the Python interface to the WDCC. This notebook works well on Levante with the /work/bm1344/conda-evns/py_312 environment.

This link leads to a deployed version of both apps. It starts a web worker in the background which runs the code and finally reloads the web page.

import panel as pn
pn.extension('tabulator', sizing_mode="stretch_width", template="fast", inline=True)
from panel.widgets import Tqdm
from bokeh.models.widgets.tables import HTMLTemplateFormatter
import pandas as pd
import json
import requests
import tempfile
import hvplot.pandas
from pywdcc import WDCCClient

client = WDCCClient()

# Safety limit for the number of dataset acronyms fetched per experiment
ALLOWED_NO_OF_DATASETS = 700
# Base URL of WDCC landing pages; the acronym is appended to it
LANDING_PAGE_TRUNK_URL = "https://www.wdc-climate.de/ui/entry?acronym="
# CMIP6 Data Reference Syntax (DRS) template used to label path elements
drs = 'Project_id/Activity_id/Institution_id/Source_id/Experiment_id/Member_id/Table_id/Variable_id/Grid_label/Version_id/Filename'
# Columns hidden in the interactive tables
to_hide = ["Path", "Filename", "Record"]

Fully archived CMIP6 experiments#

app1 fetches data from the WDCC API to generate an interactive table of all fully archived CMIP6 experiments. The table contains a column for each DRS element. An identifier column (Acronym) links to the WDCC landing page so that you can switch to the GUI interface at any point.

We also add filters that both allow subsetting the table and show the unique values of each column.

Create a table#

def map_drs(row, start: int = 4, end: int = 999) -> dict:
    """Map the 'Path' entry of a table row onto the DRS template.

    Returns a dict with one key per DRS element in the range [start:end].
    """
    drs_dict = {}
    for key, value in zip(
        drs.split('/')[start:end],
        row["Path"].split('/')[start:end]
    ):
        drs_dict[key] = value
    return drs_dict


# Render the Acronym column as a link to its WDCC landing page
link_formatter = HTMLTemplateFormatter(
    template=f'<a target="_blank" href="{LANDING_PAGE_TRUNK_URL}<%= value %>"><%= value %></a>'
)
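As a quick illustration, map_drs works with any mapping that has a "Path" key; the example path below is a CMIP6-style path of the kind found in the table further down:

# Illustration only: a minimal example row
example_row = {"Path": "CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585"}
map_drs(example_row, start=1, end=5)
# -> {'Activity_id': 'ScenarioMIP', 'Institution_id': 'MPI-M',
#     'Source_id': 'MPI-ESM1-2-LR', 'Experiment_id': 'ssp585'}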

Now we fetch the experiment records. To avoid overloading the API, we request them in batches of 100. model_dump returns the response as a dict.

total_docs = []
estimated_total_number_of_exps = 1500
# Page through the API in steps of 100 records
for s in range(0, estimated_total_number_of_exps, 100):
    cmip6_solr_dict = await client.get_all_fully_archived_cmip6_exps(start=s)
    total_docs += cmip6_solr_dict.model_dump()["docs"]

Next, we parse the information as a pandas DataFrame table:

df = pd.DataFrame.from_records(
    total_docs,
    exclude=[
        'id',
        #'entry_acronym_s',
        'textSuggest',
        #'entry_name_s',
        'title_sort',
        'creation_date_dt', 'entry_type_s', 'entry_type_l',
        'publication_type_s', 'publication_type_l', 'progress_acronym_s',
        'summary_s', 'data_size_l', 'geo', 'date_range_rdt', 'general_key_ss',
        'institute_info_ss', 'pinstitute_info_ss', 'person_name_ss',
        'project_name_ss', 'project_acronym_ss', 'hierarchy_ss',
        'hierarchy_steps_ss', 'access_s', 'authors_s', 'is_downloadable',
        '_version_', 'score']
).rename(columns=dict(
    entry_acronym_s="Acronym",
    entry_name_s="Path"
))
# Turn the space-separated WDCC hierarchy string into a '/'-separated path,
# dropping the leading "WCRP" label
df["Path"] = df["Path"].str.removeprefix("WCRP").str.strip().str.replace(' ', '/')

We can map the Path column onto the DRS to create additional columns, one for each DRS element:

# Expand the DRS mapping into one new column per DRS element
applied_df = df.apply(lambda row: map_drs(row, start=1, end=5), axis='columns', result_type='expand')
c6df = pd.concat([df, applied_df], axis='columns')
c6df
Acronym Path Activity_id Institution_id Source_id Experiment_id
0 C6_4097176 CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piAer AerChemMIP BCC BCC-ESM1 hist-piAer
1 C6_4097185 CMIP6/AerChemMIP/BCC/BCC-ESM1/hist-piNTCF AerChemMIP BCC BCC-ESM1 hist-piNTCF
2 C6_4097181 CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST AerChemMIP BCC BCC-ESM1 histSST
3 C6_4097190 CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST-piCH4 AerChemMIP BCC BCC-ESM1 histSST-piCH4
4 C6_4097177 CMIP6/AerChemMIP/BCC/BCC-ESM1/histSST-piNTCF AerChemMIP BCC BCC-ESM1 histSST-piNTCF
... ... ... ... ... ... ...
1163 C6_4138740 CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp245 ScenarioMIP MPI-M MPI-ESM1-2-LR ssp245
1164 C6_4138737 CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp370 ScenarioMIP MPI-M MPI-ESM1-2-LR ssp370
1165 C6_4138738 CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585 ScenarioMIP MPI-M MPI-ESM1-2-LR ssp585
1166 C6_5168193 CMIP6/VolMIP/MPI-M/MPI-ESM1-2-LR/volc-long-eq VolMIP MPI-M MPI-ESM1-2-LR volc-long-eq
1167 C6_5168194 CMIP6/VolMIP/MPI-M/MPI-ESM1-2-LR/volc-pinatubo... VolMIP MPI-M MPI-ESM1-2-LR volc-pinatubo-full

1168 rows × 6 columns

Make the table interactive#

At this stage, we could continue to work with c6df programmatically. To make it even more convenient, we wrap the table in a pn.widgets.Tabulator widget to make it interactive. For the final app1, we add filters on the left that show the unique values of each column.

def get_filter_columns(df) -> dict:
    """Create a MultiChoice filter widget for each visible column."""
    column_filter_dict = {}
    for col in list(df.columns):
        if col in to_hide or "Size" in col or "blob" in col.lower():
            continue
        select = pn.widgets.MultiChoice(
            options=sorted(a for a in df[col].unique() if a),
            name=col,
            search_option_limit=100
        )
        column_filter_dict[col] = select
    return column_filter_dict


c6tabu = pn.widgets.Tabulator(
    c6df,
    show_index=False,
    formatters={"Acronym": link_formatter},
    hidden_columns=to_hide,
    header_filters=True,
    selectable=1,
    page_size=20,
    disabled=True
)
c6filters = get_filter_columns(c6df)
for col, cf in c6filters.items():
    c6tabu.add_filter(cf, col)
app1 = pn.Column(
    pn.Row(
        pn.Column(
            *list(c6filters.values())
        ),
        pn.Column(
            pn.pane.HTML('<h3>Fully archived CMIP6 experiments</h3>'),
            c6tabu
        ),
        name="prerendered"
    )
)
app1.servable()

File subsetting and download script generation for a specific dataset#

For our app2, we use two widgets in a row: a TextInput for a WDCC dataset landing page, and a button that triggers the creation of a row containing the interactive table for that input.

app2 = pn.Row()
dataset_landing_page = pn.widgets.TextInput(
    name='WDCC Dataset Landing Page',
    placeholder='https://www.wdc-climate.de/ui/entry?acronym=C6_5189604',
    description="Specify a URL to a WDCC Dataset Landing Page"
)
op = pn.widgets.Button(
    name="Create an interactive table",
    description="This will fetch the WDCC Dataset and children"
)

Each created table should be removable. For that, we define the remove_row function and bind it to a button displayed next to each interactive table.

def remove_row(event, name):
    global app2
    if not event:
        return ""
    # Collect the indices of all app2 children whose name matches
    idxl = [i for i, elem in enumerate(app2) if name in elem.name]
    # Pop in reverse order so that removals do not shift pending indices
    for idx in reversed(idxl):
        app2.pop(idx)
    return ""

For the current selection of the interactive table, you can click another button that generates a download script that will work on Levante. The script loads the jblob module and downloads each selected file into its DRS directory, as in the sketch below.
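A minimal sketch of a generated script; the dataset acronym, record number, and file path are hypothetical placeholders:

module load jblob
mkdir -p CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r1i1p1f1/Amon/tas/gn/v20190710
jblob --dataset C6_5189604 --rmin 1 --rmax 1 --file CMIP6/ScenarioMIP/MPI-M/MPI-ESM1-2-LR/ssp585/r1i1p1f1/Amon/tas/gn/v20190710/tas_Amon_MPI-ESM1-2-LR_ssp585_r1i1p1f1_gn_201501-203412.nc

The create_jblob_script function writes such a script for the current table view and returns a FileDownload widget for it: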

def create_jblob_script(event, name):
    global app2
    if not event:
        return ""
    ltabu = None
    dow = None
    # Find the Tabulator widget that belongs to the table named `name`
    for i, elem in enumerate(app2):
        if name in elem.name:
            ltabu = elem[0][-1][-1]
    if ltabu:
        # current_view honours all active filters of the table
        recent_df = ltabu.current_view
        tmp_file = tempfile.NamedTemporaryFile(suffix=".sh", delete=False)
        fname = f"jblob-script-{name}.sh"

        with open(tmp_file.name, 'w') as f:
            f.write('module load jblob\n')
            for local_acro in recent_df["Acronym"].unique():
                subtabu = recent_df[recent_df["Acronym"] == local_acro]
                for rec in subtabu["Record"].unique():
                    subsubtabu = subtabu[subtabu["Record"] == rec]
                    # Create the DRS directory and download the file into it
                    f.write(f"mkdir -p {'/'.join(subsubtabu['Path'].values[0].split('/')[:-1])}\n")
                    f.write(f"jblob --dataset {local_acro} --rmin {rec} --rmax {rec} --file {subsubtabu['Path'].values[0]}\n")

        dow = pn.widgets.FileDownload(
            file=tmp_file.name,
            filename=fname
        )
    return dow
def refine_df(df):
    """Rename download-form columns and derive helper columns."""
    df = df.rename(
        columns=dict(
            FILE_NAME="Path",
            FILE_SIZE="Size [MB]",
            CACHED_CONTAINER="Cached",
            BLOB_ID="Record"
        )
    )
    df["Record"] = df["Record"].astype(int)
    df["Filename"] = df["Path"].str.split('/').str[-1]
    # Convert bytes to MB
    df["Size [MB]"] = (df["Size [MB]"] / 1024**2).astype(int)
    df.loc[~df["Cached"].isnull(), "Cached"] = "Yes"
    df.loc[df["Cached"].isnull(), "Cached"] = "No"
    # The last underscore-separated part of a CMIP6 filename is the time range;
    # time-independent files end with the grid label instead (e.g. 'gn')
    df["Time slice"] = df["Filename"].str.split('_').str[-1].str.strip('.nc')
    df.loc[df["Time slice"].str[0] == 'g', "Time slice"] = "Fix"
    df = df.drop(columns="Filename")
    applied_df = df.apply(lambda row: map_drs(row), axis='columns', result_type='expand')
    df = pd.concat([df, applied_df], axis='columns')
    df["Grid_label"] = df["Grid_label"].astype(str)
    return df
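To illustrate the Time slice logic, consider two hypothetical CMIP6 filenames:

# Hypothetical filenames; real ones come from the download forms
names = pd.Series([
    "tas_Amon_MPI-ESM1-2-LR_ssp585_r1i1p1f1_gn_201501-203412.nc",
    "areacella_fx_MPI-ESM1-2-LR_ssp585_r1i1p1f1_gn.nc",
])
names.str.split('_').str[-1].str.strip('.nc')
# -> ['201501-203412', 'g']; a leading 'g' indicates a grid label,
#    i.e. a time-independent field, which refine_df relabels as "Fix"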

If you do not want to work with the app but rather only with the DataFrame, you can just run the create_df function for a specific experiment acronym main_acronym.

async def create_df(client, main_acronym):
    """Collect the download metadata of all datasets below main_acronym
    into a single DataFrame."""
    global ALLOWED_NO_OF_DATASETS, app2
    all_acronyms = await client.get_dataset_acronyms(main_acronym)
    no_all_acronyms = len(all_acronyms)
    if no_all_acronyms > ALLOWED_NO_OF_DATASETS:
        raise ValueError(f"More than {ALLOWED_NO_OF_DATASETS} acronyms are not allowed. Found {no_all_acronyms} for acronym {main_acronym}")
    df = pd.DataFrame()
    print("Start for loop...")
    # Show a progress bar in the app while fetching the download forms
    tqdm = Tqdm()
    app2.append(tqdm)
    for acronym in tqdm(all_acronyms):
        download_form = await client.get_download_form(acronym)
        newdf = pd.DataFrame.from_records(
            download_form["downloadInfo"]["metaTable"],
            exclude=[
                'ENTRY_ID',
                #'BLOB_ID',
                'CHECKSUM',
                'CHECKSUM_TYPE',
                'EXTERNAL_IDENTIFIER',
                'START_DATE',
                'UPLOAD_DATE',
                #'CACHED_CONTAINER'
            ]
        )
        newdf["Acronym"] = acronym
        df = pd.concat([df, newdf])
    df = refine_df(df)
    return df
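# Example usage in an async context, e.g. a notebook cell
# (hypothetical acronym, taken from the TextInput placeholder above):
#   df = await create_df(client, "C6_5189604")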
async def create_app2(event):
    global to_hide, app2, link_formatter, ALLOWED_NO_OF_DATASETS, client
    landing_page = dataset_landing_page.value
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # Resolve redirects so that we work with the canonical landing page URL
    resp = requests.get(landing_page, allow_redirects=True, headers=headers)
    if resp.status_code >= 500:
        raise ValueError(f"Landing page {landing_page} returned a server error")
    landing_page = resp.url
    main_acronym = await client.get_acronym_from_landing_page(landing_page)
    if not main_acronym:
        raise ValueError(f"Failed to find acronym for landing page {landing_page}")
    if await client.check_project_for_acronym(main_acronym, "CMIP6"):
        df = await create_df(client, main_acronym)
        tabu = pn.widgets.Tabulator(
            df,
            show_index=False,
            formatters={"Acronym": link_formatter},
            hidden_columns=to_hide,
            header_filters=True,
            page_size=20,
            disabled=True
        )
        column_filter_dict = get_filter_columns(df)
        for col, select in column_filter_dict.items():
            tabu.add_filter(select, col)
        rem = pn.widgets.Button(
            name="Remove table"
        )
        jblob = pn.widgets.Button(
            name="Create jblob script"
        )
        avw = pn.pane.HTML(f'<h3>Volume of files of dataset: {df["Size [MB]"].sum()} MB</h3>')
        uvn = pn.pane.HTML(f'<h3>Unique variable names of dataset: {len(df["Variable_id"].unique())}</h3>')
        # Replace the progress bar appended by create_df with the new table
        app2.pop(-1)
        app2.append(
            pn.Column(
                pn.Row(
                    pn.Column(
                        *list(column_filter_dict.values()),
                        rem,
                        pn.bind(
                            remove_row,
                            rem,
                            main_acronym
                        ),
                        jblob,
                        pn.bind(
                            create_jblob_script,
                            jblob,
                            main_acronym
                        )
                    ),
                    pn.Column(
                        pn.pane.HTML(f'<h3><a target="_blank" href="{landing_page}">{main_acronym}</a></h3>'),
                        tabu
                    )
                ),
                pn.Row(
                    avw,
                    uvn,
                ),
                name=main_acronym
            )
        )
    else:
        raise ValueError(f"Acronym {main_acronym} not a CMIP6 dataset")

op.on_click(create_app2)
app2 = pn.Column(pn.Row(dataset_landing_page, op))
app2.servable()
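Both apps are marked as servable. As a minimal sketch, assuming the notebook is saved as wdcc_cmip6.ipynb, you can serve it yourself with the Panel CLI:

panel serve wdcc_cmip6.ipynb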