Daily check of availability of 30 min raw data

Author

Matthias Cuntz

Published

October 8, 2025

Define helper functions

Code
# Global switch: when True, the cells below print extra diagnostics
# (logger names, resolved file names) while building the report.
debug = False
Code
import configparser
import datetime as dt
import glob
import os
import platform
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

def _get_datadir(config):
    '''
    Get datadir from config file and return Windows and macOS full path

    Parameters
    ----------
    config : ConfigParser
        A ConfigParser having read a config file

    Returns
    -------
    directory of BD_Hesse on current computer

    '''
    datadir = config['GENERAL'].get('datadir', '')

    if datadir and (not datadir.startswith('/')):
        ios = platform.system()  # Windows, Darwin, Linux
        if ios == 'Windows':
            datadir = '//pnas1.stockage.inra.fr/nancy-eef-prnas/' + datadir
        elif ios == 'Darwin':
            datadir = '/Volumes/nancy-eef-prnas/' + datadir
        else:
            print('Operating system not known:', ios)

    # # MC local
    # datadir = '/Users/cuntz/data/inrae/hesse/BD_Hesse'
    # # MC local

    return datadir


def _get_logger_files(config, loggers, datadir, level='raw'):
    '''
    Get file names of loggers from config file

    Parameters
    ----------
    config : ConfigParser
        A ConfigParser having read a config file
    logger : list
        List of logger names in config file
    datadir : str
        Directory of BD_Hesse on current computer
    level : str, optional
        Data level raw, DB1, calib (default: raw)

    Returns
    -------
    dictionary with logger names as keys and file names as values.
    Non-existing entries in config file have empty string as file name.

    '''
    if level.lower().startswith('raw'):
        section = 'RAWFILES'
    elif level.lower().startswith('db1'):
        section = 'DB1FILES'
    elif level.lower().startswith('cal'):
        section = 'CALIBFILES'
    else:
        raise ValueError(f'level not known: {level}')

    ifiles = {}
    for mm in loggers:
        ff = config[section].get(mm, '')
        if ff:
            ifiles.update({mm: datadir + '/' + ff})
        else:
            ifiles.update({mm: ''})

    return ifiles


def _get_smartflux_files(datadir, year=None):
    '''
    Get filenames of Smartflux ghg files for year

    Parameters
    ----------
    datadir : str
        Directory of BD_Hesse on current computer
    year : int, optional
        Smartflux ghg files for year. Must be >= 2019
        (default: current year)

    Returns
    -------
    list with filenames

    '''
    if year is None:
        today = dt.datetime.today()
        year = today.year

    if year >= 2024:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT/raw'
    elif year == 2023:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT_{year}/raw'
    elif year >= 2019:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT_{year}'

    ifiles = glob.glob(f'{idir}/*.ghg')

    return ifiles


def _read_csv(ifile, remove_cols=[], rename_cols=[], level='raw'):
    '''
    Read values from standard logger file from crbasic2db1.py,
    optionally remove remove columns and rename columns, and then
    select only columns with standard names such as TA_1_1_1.

    Parameters
    ----------
    ifile : str
        Input logger file
    remove_cols : list, optional
        List of column names to delete
    rename_cols : list, optional
        List of column names to rename to `logger + '_' + column_name`
    level : str, optional
        Data level raw, DB1 (default: raw)

    Returns
    -------
    pandas.DataFrame

    '''
    if level.lower().startswith('raw'):
        skiprows = [0, 2, 3]
        na_values = 'NAN'
    elif level.lower().startswith('db1'):
        skiprows = None
        na_values = '-9999'
    else:
        raise ValueError(f'level not known: {level}')

    logger = '_'.join(os.path.basename(ifile).split('_')[:-2])

    df = pd.read_csv(ifile, sep=',', header='infer', skiprows=skiprows,
                     index_col=0, parse_dates=True, na_values=na_values)

    rm_cols = [ nn for nn in remove_cols if nn in df.columns ]
    if len(rm_cols) > 0:
        df.drop(columns=rm_cols, inplace=True)
    ren_cols = { nn: logger + '_' + nn for nn in rename_cols
                 if nn in df.columns }
    if len(ren_cols) > 0:
        df.rename(columns=ren_cols, inplace=True)

    # # take only variables with standard names
    # drops = [ cc for cc in df.columns
    #           if re.fullmatch('.+_[0-9]+_[0-9]+_[0-9]+', cc) is None ]
    # if len(drops) > 0:
    #     df.drop(columns=drops, inplace=True)

    return df

# Seaborn's Oranges color palette starting with white
# Build a continuous colormap from seaborn's discrete 'Oranges' palette,
# then sample it into a discrete ListedColormap whose first entry is
# replaced by white, so that a value of 0 (no missing data) renders white.
oranges = mpl.colors.LinearSegmentedColormap.from_list(
    'oranges', sns.color_palette('Oranges'))
# integer indices address the colormap's lookup table directly
ocols = oranges(np.arange(255, dtype=int))
ocols[0] = [1., 1., 1., 1.]  # add white at start
cmap = mpl.colors.ListedColormap(ocols)

Check availability of raw data

Check config file

Code
ndays = 7
today = dt.datetime.today()
fromday = today - dt.timedelta(days=ndays)
year = today.year

configfile = f'FR-Hes_{year}.cfg'
print(f"Read config file: {configfile}")
config = configparser.ConfigParser(interpolation=None)
config.read(configfile)

# options
datadir = _get_datadir(config)

# loggers
loggers = list(config['RAWFILES'].keys())
loggers.remove('year')
loggers.remove('units')
if debug:
    print(f"Loggers: {loggers}")

rfiles = _get_logger_files(config, loggers, datadir, level='raw')
rfilenames = { ll: os.path.basename(rfiles[ll]) for ll in rfiles }
rnames = { ll: '_'.join(rfilenames[ll].split('_')[0:2]) for ll in rfilenames }
if debug:
    print(f"Raw filenames: {rfilenames}")

dfiles = _get_logger_files(config, loggers, datadir, level='DB1')
dfilenames = [ os.path.basename(dfiles[ll]) for ll in dfiles ]
if debug:
    print(f"DB1 filenames: {dfilenames}")
Read config file: FR-Hes_2025.cfg

Missing 30 min raw data

Code
# loggers + smartflux
ndata = np.full((len(loggers) + 1, ndays), 48, dtype=int)
# loggers
for ii, ll in enumerate(loggers):
    if debug:
        print(f"Read file: {rfiles[ll]}")
    df = _read_csv(rfiles[ll], level='raw')
    for dd in range(ndays):
        isday = today - dt.timedelta(days=dd + 1)
        ndata[ii, -dd-1] -= len(df[df.index.date == isday.date()])
        if ll == 'profile':
            ndata[ii, -dd-1] += 422 - 48
    # if ll == 'profile':
    #         ndata[ii, -1] //= 8
# smartflux
sfiles = [ os.path.basename(ff)[0:10]
           for ff in _get_smartflux_files(datadir, year=year) ]
for dd in range(ndays):
    isday = today - dt.timedelta(days=dd + 1)
    sisday = isday.strftime('%Y-%m-%d')
    ndata[-1, -dd-1] = 48 - sfiles.count(sisday)
    
loggersmart = loggers.copy()
loggersmart.append('smartflux')
prevdays = np.linspace(-ndays, -1, ndays, dtype=int)
df = pd.DataFrame(ndata, index=loggersmart,
                  columns=prevdays)
vmax = 48
fig, ax = plt.subplots(figsize=(6.4, df.shape[0]/4.))
sns.heatmap(axes=ax, data=df, vmax=vmax, annot=True, linewidths=0.5,
                cmap=cmap, xticklabels=prevdays,
                yticklabels=df.index)
ax.set_xlabel('Days before today')
ax.set_ylabel('Logger')
plt.show()

NaN raw data per logger

Code
# loggers
firstday = today - dt.timedelta(days=ndays)
prevdays = np.linspace(-ndays, -1, ndays, dtype=int)
for ll in loggers:
    print(f'{ll}')
    df = _read_csv(rfiles[ll], level='raw')
    df = df[(df.index.date >= firstday.date()) & (df.index.date < today.date())].isna()
    sf = df.groupby(df.index.date).sum()
    sf = sf.T
    sf.columns = prevdays[:len(sf.columns)]
    if ll == 'profile':
        vmax = 422
    else:
        vmax = 48
    fig, ax = plt.subplots(figsize=(6.4, sf.shape[0]/4.))
    sns.heatmap(axes=ax, data=sf, vmax=vmax, cmap=cmap, linewidths=0.5,
                xticklabels=prevdays, yticklabels=sf.index,
                annot=True)  # , fmt=':d')
    ax.set_xlabel('Days before today')
    ax.set_ylabel('Variable')
    plt.show()
h1

h1_meteobackup

h1_cr310

meteo

radiation

radiation2

circonf

profile

soil_ab

soil_cd

soil_e

soil_f

soil_g

cp_01

cp_02