Daily check of availability of 30 min raw data

Author

Matthias Cuntz

Published

October 8, 2025

Define helper functions

Code
# Global switch: when True, the cells below print extra diagnostics
# (logger names, resolved file names) while building the report.
debug = False
Code
import configparser
import datetime as dt
import glob
import os
import platform
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

def _get_datadir(config):
    '''
    Get datadir from config file and return Windows and macOS full path

    Parameters
    ----------
    config : ConfigParser
        A ConfigParser having read a config file

    Returns
    -------
    directory of BD_Hesse on current computer

    '''
    datadir = config['GENERAL'].get('datadir', '')

    if datadir and (not datadir.startswith('/')):
        ios = platform.system()  # Windows, Darwin, Linux
        if ios == 'Windows':
            datadir = '//pnas1.stockage.inra.fr/nancy-eef-prnas/' + datadir
        elif ios == 'Darwin':
            datadir = '/Volumes/nancy-eef-prnas/' + datadir
        else:
            print('Operating system not known:', ios)

    # # MC local
    # datadir = '/Users/cuntz/data/inrae/hesse/BD_Hesse'
    # # MC local

    return datadir


def _get_logger_files(config, loggers, datadir, level='raw'):
    '''
    Get file names of loggers from config file

    Parameters
    ----------
    config : ConfigParser
        A ConfigParser having read a config file
    logger : list
        List of logger names in config file
    datadir : str
        Directory of BD_Hesse on current computer
    level : str, optional
        Data level raw, DB1, calib (default: raw)

    Returns
    -------
    dictionary with logger names as keys and file names as values.
    Non-existing entries in config file have empty string as file name.

    '''
    if level.lower().startswith('raw'):
        section = 'RAWFILES'
    elif level.lower().startswith('db1'):
        section = 'DB1FILES'
    elif level.lower().startswith('cal'):
        section = 'CALIBFILES'
    else:
        raise ValueError(f'level not known: {level}')

    ifiles = {}
    for mm in loggers:
        ff = config[section].get(mm, '')
        if ff:
            ifiles.update({mm: datadir + '/' + ff})
        else:
            ifiles.update({mm: ''})

    return ifiles


def _get_smartflux_files(datadir, year=None):
    '''
    Get filenames of Smartflux ghg files for year

    Parameters
    ----------
    datadir : str
        Directory of BD_Hesse on current computer
    year : int, optional
        Smartflux ghg files for year. Must be >= 2019
        (default: current year)

    Returns
    -------
    list with filenames

    '''
    if year is None:
        today = dt.datetime.today()
        year = today.year

    if year >= 2024:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT/raw'
    elif year == 2023:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT_{year}/raw'
    elif year >= 2019:
        idir = f'{datadir}/Donnees_brutes/{year}/Auto/SmartfluxNT_{year}'

    ifiles = glob.glob(f'{idir}/*.ghg')

    return ifiles


def _read_csv(ifile, remove_cols=[], rename_cols=[], level='raw'):
    '''
    Read values from standard logger file from crbasic2db1.py,
    optionally remove remove columns and rename columns, and then
    select only columns with standard names such as TA_1_1_1.

    Parameters
    ----------
    ifile : str
        Input logger file
    remove_cols : list, optional
        List of column names to delete
    rename_cols : list, optional
        List of column names to rename to `logger + '_' + column_name`
    level : str, optional
        Data level raw, DB1 (default: raw)

    Returns
    -------
    pandas.DataFrame

    '''
    if level.lower().startswith('raw'):
        skiprows = [0, 2, 3]
        na_values = 'NAN'
    elif level.lower().startswith('db1'):
        skiprows = None
        na_values = '-9999'
    else:
        raise ValueError(f'level not known: {level}')

    logger = '_'.join(os.path.basename(ifile).split('_')[:-2])

    df = pd.read_csv(ifile, sep=',', header='infer', skiprows=skiprows,
                     index_col=0, parse_dates=True, na_values=na_values)

    rm_cols = [ nn for nn in remove_cols if nn in df.columns ]
    if len(rm_cols) > 0:
        df.drop(columns=rm_cols, inplace=True)
    ren_cols = { nn: logger + '_' + nn for nn in rename_cols
                 if nn in df.columns }
    if len(ren_cols) > 0:
        df.rename(columns=ren_cols, inplace=True)

    # # take only variables with standard names
    # drops = [ cc for cc in df.columns
    #           if re.fullmatch('.+_[0-9]+_[0-9]+_[0-9]+', cc) is None ]
    # if len(drops) > 0:
    #     df.drop(columns=drops, inplace=True)

    return df

# Seaborn's Oranges color palette starting with white
# Build a continuous colormap from seaborn's discrete 'Oranges' palette,
# then sample it into a discrete ListedColormap whose first entry is
# replaced by white, so that a value of 0 (no missing data) renders white.
oranges = mpl.colors.LinearSegmentedColormap.from_list(
    'oranges', sns.color_palette('Oranges'))
# integer indices address the colormap's lookup table directly
ocols = oranges(np.arange(255, dtype=int))
ocols[0] = [1., 1., 1., 1.]  # add white at start
cmap = mpl.colors.ListedColormap(ocols)

Check availability of raw data

Check config file

Code
ndays = 7
today = dt.datetime.today()
fromday = today - dt.timedelta(days=ndays)
year = today.year

configfile = f'FR-Hes_{year}.cfg'
print(f"Read config file: {configfile}")
config = configparser.ConfigParser(interpolation=None)
config.read(configfile)

# options
datadir = _get_datadir(config)

# loggers
loggers = list(config['RAWFILES'].keys())
loggers.remove('year')
loggers.remove('units')
if debug:
    print(f"Loggers: {loggers}")

rfiles = _get_logger_files(config, loggers, datadir, level='raw')
rfilenames = { ll: os.path.basename(rfiles[ll]) for ll in rfiles }
rnames = { ll: '_'.join(rfilenames[ll].split('_')[0:2]) for ll in rfilenames }
if debug:
    print(f"Raw filenames: {rfilenames}")

dfiles = _get_logger_files(config, loggers, datadir, level='DB1')
dfilenames = [ os.path.basename(dfiles[ll]) for ll in dfiles ]
if debug:
    print(f"DB1 filenames: {dfilenames}")
Read config file: FR-Hes_2025.cfg

Missing 30 min raw data

Code
# loggers + smartflux
ndata = np.full((len(loggers) + 1, ndays), 48, dtype=int)
# loggers
for ii, ll in enumerate(loggers):
    if debug:
        print(f"Read file: {rfiles[ll]}")
    df = _read_csv(rfiles[ll], level='raw')
    for dd in range(ndays):
        isday = today - dt.timedelta(days=dd + 1)
        ndata[ii, -dd-1] -= len(df[df.index.date == isday.date()])
        if ll == 'profile':
            ndata[ii, -dd-1] += 422 - 48
    # if ll == 'profile':
    #         ndata[ii, -1] //= 8
# smartflux
sfiles = [ os.path.basename(ff)[0:10]
           for ff in _get_smartflux_files(datadir, year=year) ]
for dd in range(ndays):
    isday = today - dt.timedelta(days=dd + 1)
    sisday = isday.strftime('%Y-%m-%d')
    ndata[-1, -dd-1] = 48 - sfiles.count(sisday)
    
loggersmart = loggers.copy()
loggersmart.append('smartflux')
prevdays = np.linspace(-ndays, -1, ndays, dtype=int)
df = pd.DataFrame(ndata, index=loggersmart,
                  columns=prevdays)
vmax = 48
fig, ax = plt.subplots(figsize=(6.4, df.shape[0]/4.))
sns.heatmap(axes=ax, data=df, vmax=vmax, annot=True, linewidths=0.5,
                cmap=cmap, xticklabels=prevdays,
                yticklabels=df.index)
ax.set_xlabel('Days before today')
ax.set_ylabel('Logger')
plt.show()

NaN raw data per logger

Code
# loggers
firstday = today - dt.timedelta(days=ndays)
prevdays = np.linspace(-ndays, -1, ndays, dtype=int)
for ll in loggers:
    print(f'{ll}')
    df = _read_csv(rfiles[ll], level='raw')
    df = df[(df.index.date >= firstday.date()) & (df.index.date < today.date())].isna()
    sf = df.groupby(df.index.date).sum()
    sf = sf.T
    sf.columns = prevdays[:len(sf.columns)]
    if ll == 'profile':
        vmax = 422
    else:
        vmax = 48
    fig, ax = plt.subplots(figsize=(6.4, sf.shape[0]/4.))
    sns.heatmap(axes=ax, data=sf, vmax=vmax, cmap=cmap, linewidths=0.5,
                xticklabels=prevdays, yticklabels=sf.index,
                annot=True)  # , fmt=':d')
    ax.set_xlabel('Days before today')
    ax.set_ylabel('Variable')
    plt.show()
h1

h1_meteobackup

h1_cr310

meteo

radiation

radiation2

circonf

profile

soil_ab

soil_cd

soil_e

soil_f

soil_g

cp_01

cp_02