Commit 9bcd7de8 authored by Jonathan Minz

created cf compliance checker script

parent dcbd99e2

2NETCDF.py

deleted 100644 → 0
+0 −330
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Conversion tool for StreamLine .hpl files into netCDF (.nc) files:
    
    - hpl2dict(): import .hpl files and save as dictionary
    - hpl_to_netcdf(): save .hpl files into level0 (l0) .nc files
    - to_netcdf_l1(): correct raw data (l0) and create level1 (l1) netCDF files
    
"""
import numpy as np
from netCDF4 import Dataset
import os
import datetime
import xarray as xr
import matplotlib.dates as mdates
import glob # Added by Jonathan Minz - 25.04.2024
import sys

'''
Import StreamLine .hpl (text) files and save them locally. The data is
converted into matrices of dimension "number of range gates" x "time stamp/rays".
In newer versions of the StreamLine software, the spectral width can be
stored as an additional parameter in the .hpl files.
'''
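# A hedged sketch of the fixed 17-line .hpl header this parser assumes. The
# labels below are illustrative only, since hpl2dict() reads just the last
# whitespace-separated token of each header line:
#   lines[0]  "Filename:  Stare_115_20240423_10.hpl"  -> filename
#   lines[1]  "System ID: 115"                        -> system_id
#   lines[2]  "Number of gates: 104"                  -> number_of_gates
#   lines[3]  "Range gate length (m): 30.0"           -> range_gate_length_m
# After the header, each ray consists of one ray-header line (decimal time,
# azimuth, elevation, pitch, roll) followed by one line per range gate.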
def hpl2dict(file_path):
    # import .hpl file into internal storage
    with open(file_path, 'r') as text_file:
        lines=text_file.readlines()

    #write lines into Dictionary
    data_temp=dict()

    header_n=17 #length of header
    data_temp['filename']=lines[0].split()[-1]
    data_temp['system_id']=int(lines[1].split()[-1])
    data_temp['number_of_gates']=int(lines[2].split()[-1])
    data_temp['range_gate_length_m']=float(lines[3].split()[-1])
    data_temp['gate_length_pts']=int(lines[4].split()[-1])
    data_temp['pulses_per_ray']=int(lines[5].split()[-1])
    data_temp['number_of_waypoints_in_file']=int(lines[6].split()[-1])
    rays_n=(len(lines)-header_n)/(data_temp['number_of_gates']+1)
    
    '''
    The number of lines does not match the expected format if the number of
    range gates was changed during the measuring period of the data file
    (especially possible for stare data).
    '''
    if not rays_n.is_integer():
        print('Number of lines does not match expected format')
        return np.nan
    
    data_temp['no_of_rays_in_file']=int(rays_n)
    data_temp['scan_type']=' '.join(lines[7].split()[2:])
    data_temp['focus_range']=lines[8].split()[-1]
    data_temp['start_time']=' '.join(lines[9].split()[-2:])
    data_temp['resolution']=('%s %s' % (lines[10].split()[-1],'m s-1'))
    data_temp['range_gates']=np.arange(0,data_temp['number_of_gates'])
    data_temp['center_of_gates']=(data_temp['range_gates']+0.5)*data_temp['range_gate_length_m']

    #dimensions of data set
    gates_n=data_temp['number_of_gates']
    rays_n=data_temp['no_of_rays_in_file']

    # measurement variables are preallocated as NaN-filled numpy arrays of fixed shape
    data_temp['radial_velocity'] = np.full([gates_n,rays_n],np.nan) #m s-1
    data_temp['intensity'] = np.full([gates_n,rays_n],np.nan) #SNR+1
    data_temp['beta'] = np.full([gates_n,rays_n],np.nan) #m-1 sr-1
    data_temp['spectral_width'] = np.full([gates_n,rays_n],np.nan)
    data_temp['elevation'] = np.full(rays_n,np.nan) #degrees
    data_temp['azimuth'] = np.full(rays_n,np.nan) #degrees
    data_temp['decimal_time'] = np.full(rays_n,np.nan) #hours
    data_temp['pitch'] = np.full(rays_n,np.nan) #degrees
    data_temp['roll'] = np.full(rays_n,np.nan) #degrees
    
    for ri in range(0,rays_n): #loop rays
        lines_temp = lines[header_n+(ri*gates_n)+ri+1:header_n+(ri*gates_n)+gates_n+ri+1]
        header_temp = np.asarray(lines[header_n+(ri*gates_n)+ri].split(),dtype=float)
        data_temp['decimal_time'][ri] = header_temp[0]
        data_temp['azimuth'][ri] = header_temp[1]
        data_temp['elevation'][ri] = header_temp[2]
        data_temp['pitch'][ri] = header_temp[3]
        data_temp['roll'][ri] = header_temp[4]
        for gi in range(0,gates_n): #loop range gates
            line_temp=np.asarray(lines_temp[gi].split(),dtype=float)
            data_temp['radial_velocity'][gi,ri] = line_temp[1]
            data_temp['intensity'][gi,ri] = line_temp[2]
            data_temp['beta'][gi,ri] = line_temp[3]
            if line_temp.size>4:
                data_temp['spectral_width'][gi,ri] = line_temp[4]

    return data_temp
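
# A minimal usage sketch (hypothetical path):
# data = hpl2dict('/data/hpl/Stare_115_20240423_10.hpl')
# if isinstance(data, dict):
#     print(data['number_of_gates'], data['no_of_rays_in_file'])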

'''
Write the .hpl content into a level0 (l0) netCDF file; no data is added,
changed or removed. Information about institution and contact is optional.
'''
def hpl_to_netcdf(file_path,path_out,institution=None,contact=None,overwrite=False):
    # check that the input file exists
    if not os.path.exists(file_path):
        print('%s cannot be found' %os.path.basename(file_path))
        return
    
    # import data as dictionary
    data_temp = hpl2dict(file_path)
    if not isinstance(data_temp, dict): return
    
    '''
    Level0 netCDF files are stored in a folder structure that mirrors the
    structure used by the StreamLine software:
    path_out/yyyy/yyyymm/yyyymmdd/file_name.nc
    '''
    datestr=[fns for fns in data_temp['filename'].split('_') if len(fns)==8][0]
    path_out_date=os.path.join(path_out, datestr[0:4], datestr[0:6], datestr)
    # create the output directory if it does not exist yet
    if not os.path.exists(path_out_date):
        os.makedirs(path_out_date)

    file_name_out = data_temp['filename'].split('.')[0]+'_l0.nc'
    path_file = os.path.join(path_out_date,file_name_out)
    if os.path.isfile(path_file):
        if not overwrite:
            raise Exception('%s already exists' % path_file)
        else:
            os.remove(path_file)

    dataset_temp = Dataset(path_file,'w',format ='NETCDF4')
    # define dimensions
    dataset_temp.createDimension('NUMBER_OF_GATES',data_temp['number_of_gates'])
    dataset_temp.createDimension('NUMBER_OF_RAYS',data_temp['no_of_rays_in_file'])
    # Metadata
    dataset_temp.description = "non-processed data of Halo Photonics Streamline"
    if institution:
        dataset_temp.institution = institution
    if contact:
        dataset_temp.contact = contact
    dataset_temp.focus_range = '%s m' % data_temp['focus_range']
    dataset_temp.range_gate_length = '%i m' % data_temp['range_gate_length_m']
    dataset_temp.pulses_per_ray = data_temp['pulses_per_ray']
    dataset_temp.start_time = data_temp['start_time']
    dataset_temp.system_id = data_temp['system_id']
    dataset_temp.scan_type = data_temp['scan_type']
    dataset_temp.resolution = data_temp['resolution']
    dataset_temp.number_waypoint = data_temp['number_of_waypoints_in_file']
    dataset_temp.history = 'File created on %s ' % datetime.datetime.now().strftime('%d %b %Y %H:%M')
    
    decimal_time = dataset_temp.createVariable('decimal_time', np.float64, ('NUMBER_OF_RAYS'))
    decimal_time.units = 'decimal time (hours) UTC'
    decimal_time.long_name = 'start time of each ray'
    decimal_time[:] = data_temp['decimal_time']
    
    azi = dataset_temp.createVariable('azimuth', np.float32, ('NUMBER_OF_RAYS'))
    azi.units = 'degrees, meteorologically'
    azi.long_name = 'azimuth angle'
    azi[:] = data_temp['azimuth']
    
    ele = dataset_temp.createVariable('elevation', np.float32, ('NUMBER_OF_RAYS'))
    ele.units = 'degrees, meteorologically'
    ele.long_name = 'elevation angle'
    ele[:] = data_temp['elevation']
    
    pitch = dataset_temp.createVariable('pitch_angle', np.float32, ('NUMBER_OF_RAYS'))
    pitch.units = 'degrees'
    pitch.long_name = 'pitch angle'
    pitch[:] = data_temp['pitch']
    
    roll = dataset_temp.createVariable('roll_angle', np.float32, ('NUMBER_OF_RAYS'))
    roll.units = 'degrees'
    roll.long_name = 'roll angle'
    roll[:] = data_temp['roll']
    
    rv = dataset_temp.createVariable('radial_velocity', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    rv.units = 'm s-1'
    rv.long_name = 'Doppler velocity along line of sight'
    rv[:,:] = data_temp['radial_velocity']

    intensity = dataset_temp.createVariable('intensity', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    intensity.units = 'unitless'
    intensity.long_name = 'SNR + 1'
    intensity[:,:] = data_temp['intensity']
    
    beta = dataset_temp.createVariable('beta', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    beta.units = 'm-1 sr-1'
    beta.long_name = 'attenuated backscatter'
    beta[:,:] = data_temp['beta']
    
    gate_centers = dataset_temp.createVariable('gate_centers',np.float32,('NUMBER_OF_GATES'))
    gate_centers.units = 'm'
    gate_centers.long_name = 'center of range gates'
    gate_centers[:] = data_temp['center_of_gates']
    dataset_temp.close()
    print('%s is created successfully' % path_file)
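
# A minimal usage sketch (hypothetical paths; institution and contact are optional):
# hpl_to_netcdf('/data/hpl/Stare_115_20240423_10.hpl', '/data/netcdf/level0',
#               institution='University of XY', contact='name@example.org',
#               overwrite=True)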

'''
Class that stores location and orientation information about the Doppler lidar
instrument.
'''

class lidar_location:
    def __init__(self,lat,lon,zsl,name,lidar_id,bearing,gc_corr=0,diff_WGS84=np.nan,diff_geoid=np.nan,diff_bessel=np.nan):
        self.lat = lat
        self.lon = lon
        self.zsl = zsl
        self.name = name
        self.lidar_id = lidar_id
        self.bearing = bearing
        self.gc_corr = gc_corr # range gate center correction; referenced in to_netcdf_l1()
        self.diff_WGS84 = diff_WGS84
        self.diff_geoid = diff_geoid
        self.diff_bessel = diff_bessel
        
'''
Convert level0 netCDF data into level1 netCDF data.
Input variables:
    file_path       - string
    file_name_out   - string
    lidar_info      - instance of lidar_location(lat,lon,zsl,name,lidar_id,bearing,gc_corr=0,diff_WGS84=np.nan,diff_geoid=np.nan,diff_bessel=np.nan)
    path_out        - string
The lidar_info object needs at least the attributes lat, lon, zsl, bearing and
gc_corr; see the usage sketch after this function.
'''
def to_netcdf_l1(file_path,file_name_out,lidar_info,path_out):
    
    ds_temp=xr.open_dataset(file_path)
    
    if not os.path.exists(path_out): os.makedirs(path_out)
                    
    ds_temp.azimuth.values=ds_temp.azimuth.values-lidar_info.bearing
    ds_temp.azimuth.attrs['comment']='corrected azimuth angle (az_corrected=az_measured-bearing) --> az_corrected = 0 deg points to geographical North'
    # the combined condition is tested first; otherwise it would be unreachable
    if lidar_info.gc_corr!=0 and lidar_info.bearing!=0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: azimuth, gate_centers'
    elif lidar_info.bearing!=0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: azimuth'
    elif lidar_info.gc_corr==0 and lidar_info.bearing==0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: none'
             
    lat_var=xr.Variable([],lidar_info.lat,\
                        attrs={'units':'decimal degree north',\
                                'long_name':'latitude',\
                                'description':'latitude, north is positive',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(lat=lat_var)
    
    lon_var=xr.Variable([],lidar_info.lon,\
                        attrs={'units':'decimal degrees east',\
                                'long_name':'longitude',\
                                'description':'longitude, east is positive',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(lon=lon_var)
    
    
    zsl_var=xr.Variable([],lidar_info.zsl,\
                        attrs={'units':'m',\
                                'long_name':'altitude',\
                                'description':'Height above mean sea level, Gebrauchshoehe Adria; transformation to other references is possible by using the additive factor supplied in the fields \"difference_to_geoid\", \"difference_to_bessel\" and \"difference_to_WGS84\", which are added as a list of values, with each entry corresponding to one station, e.g. zsl[1]+zsl.difference_to_geoid[1] = height above Geoid for the first station',\
                                'missing_value':-999.,\
                                'difference_to_geoid':lidar_info.diff_geoid,\
                                'difference_to_geoid_descr':'Difference between zsl and height above geoid for each STATION_KEY, e.g. zsl[1] + zsl.difference_to_geoid[1] = Height above Geoid in m',\
                                'difference_to_bessel':lidar_info.diff_bessel,\
                                'difference_to_bessel_descr':'Difference between zsl and height above Bessel 1841 reference ellipsoid for geodetic datum MGI for each STATION_KEY; e.g. zsl[1] + zsl.difference_to_bessel[1] = Height above Bessel ellipsoid in m',\
                                'difference_to_WGS84':lidar_info.diff_WGS84,\
                                'difference_to_WGS_descr':'Difference between zsl and height above WGS84 reference ellipsoid for each STATION_KEY, e.g., zsl[1] + zsl.difference_to_WGS84[1] = Height above WGS84 ellipsoid'})
    ds_temp=ds_temp.assign(zsl=zsl_var)
    
    bearing_var=xr.Variable([],lidar_info.bearing,\
                        attrs={'units':'degrees',\
                                'long_name':'angle of bearing offset',\
                                'description':'angle between the theoretical position of North direction in projected plane (epsg:31254) and North alignment of the instrument: bearing = North_lidar - North_projected ',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(bearing=bearing_var)

    dec_time_temp=ds_temp.decimal_time.values
    
    # abort if the file also contains data from the next day
    # (decimal_time would wrap past midnight)
    if dec_time_temp[-1]<dec_time_temp[0]:
        sys.exit('decimal_time wraps past midnight; file spans two days')
    
    date_num=mdates.datestr2num(ds_temp.start_time.split()[0])
    dn_time_temp=date_num+dec_time_temp/24
    date_time=mdates.num2date(dn_time_temp)
    unix_time=[(dt- datetime.datetime(1970,1,1,tzinfo=datetime.timezone.utc)).total_seconds() for dt in date_time]
    
    dn_time_temp_var=xr.Variable(['NUMBER_OF_RAYS'],dn_time_temp,\
                        attrs={'units': 'Days since 01-01-0001 00:00:00',\
                                'long_name':'start time number of each ray'})
    ds_temp=ds_temp.assign(datenum_time=dn_time_temp_var)
    
    dt_time_temp_var=xr.Variable(['NUMBER_OF_RAYS'],unix_time,\
                        attrs={'units': 'Seconds since 01-01-1970 00:00:00',\
                                'long_name':'start time number of each ray',\
                                'description':'UNIX timestamp'})
    ds_temp=ds_temp.assign(time=dt_time_temp_var)
       
    path_out_temp = os.path.join(path_out,file_name_out)

    if os.path.isfile(path_out_temp): os.remove(path_out_temp)
    
    ds_temp.to_netcdf(path_out_temp)
    ds_temp.close()
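
# A minimal usage sketch (hypothetical paths and illustrative station values):
# info = lidar_location(48.710612, 9.190746, 388.0, 'UHOH', 115, 0.0)
# to_netcdf_l1('/data/netcdf/level0/Stare_115_20240423_10_l0.nc',
#              'Stare_115_20240423_10_l1.nc', info, '/data/netcdf/level1')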

# Added by Jonathan Minz - 25.04.2024
if __name__ == '__main__':
    # location where the l0 netcdf files are stored;
    # this directory structure is created by hpl_to_netcdf() above - NOT user defined
    file_path = '/Users/jminz/Documents/Project/2024/DataManagement_LAFI/Script_testing/Data Lidar/2024/202404/20240423'
    
    # location where l1 netcdf files should be stored. 
    # User defined. 
    path_out = '/Users/jminz/Documents/Project/2024/DataManagement_LAFI/Script_testing/Data Lidar'
    print('finding hpl files to convert to netcdf')
    list_of_hpl_files = glob.glob('Stare*')
    
    # instantiate the UHOH Doppler lidar
    # positional arguments are lidar_location(lat, lon, zsl, name, lidar_id, bearing);
    # the station name 'UHOH' is an assumption based on the comment above
    lidar_info = lidar_location(48.710612, 9.190746, 388.0, 'UHOH', 115, 0.0)

    # convert hpl files to l0 netcdf files
    print(f'found {len(list_of_hpl_files)} files')
    for idx, file in enumerate(list_of_hpl_files):
        print(f'converting file no. {idx+1} {file} to l0 netcdf')
        hpl_to_netcdf(file, path_out)
    print('converted hpl files to l0 netcdf files')
    
    # convert l0 netcdf files to l1 netcdf files
    print('converting l0 to l1 netcdf files')
    list_of_l0_ncfiles = glob.glob(os.path.join(file_path,'Stare*_l0.nc'))
    print(f'found {len(list_of_l0_ncfiles)} files')
    for idx_nc, file_nc in enumerate(list_of_l0_ncfiles):
        print(f'converting file {file_nc}')
        # derive the l1 file name from the l0 file name instead of relying on a
        # fixed path depth and splitting on the letter 'l'
        file_out_name_l1 = os.path.basename(file_nc).replace('_l0.nc', '_l1.nc')
        print(f'{file_out_name_l1}')
        print('converting l0 netcdf to l1 file')
        to_netcdf_l1(file_nc, file_out_name_l1, lidar_info, path_out)
    print('SUCCESS: conversion from hpl to l1 netcdf complete')
\ No newline at end of file

Phython_hpl_plots.py

deleted 100644 → 0
+0 −476

File deleted.


scripts/.DS_Store

0 → 100644
+6 KiB

File added.


check_cf_compliance.py

+177 −0
#!/usr/bin/env python3
"""
check_cf_compliance.py

Finds the first NetCDF file in each subdirectory of a given folder
and runs CF-Checker (cfchecks) to test compliance.

- Detailed results (per file) are saved in cf_compliance_details.log
- Summary table is saved in cf_compliance_summary.log
- Optionally export summary to CSV/Excel (with timestamped filenames)
"""

import os
import glob
import subprocess
import sys
from datetime import datetime
from tabulate import tabulate
import pandas as pd

DETAILS_LOG = "cf_compliance_details.log"
SUMMARY_LOG = "cf_compliance_summary.log"


def first_netcdf_in_subdirs(parent_dir):
    """
    Return a list of the first NetCDF file (alphabetically) in each subdirectory.
    """
    first_files = []

    for subdir in glob.glob(os.path.join(parent_dir, "*/")):
        files = glob.glob(os.path.join(subdir, "*.nc"))
        files = [f for f in files if os.path.isfile(f)]

        if files:
            files.sort()  # alphabetically
            first_files.append(files[0])

    return first_files
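
# e.g. first_netcdf_in_subdirs('/data/netcdf') might return (hypothetical paths):
# ['/data/netcdf/20240423/Stare_115_20240423_10_l0.nc', ...]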


def check_cf_compliance(file_path, cf_version=None):
    """
    Run cfchecks on a single NetCDF file and return results.
    """
    cmd = ["cfchecks"]

    if cf_version:
        cmd += ["-v", cf_version]

    cmd.append(file_path)

    try:
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False
        )
        return completed.returncode, completed.stdout, completed.stderr
    except FileNotFoundError:
        msg = "Error: 'cfchecks' command not found. Install cfchecker with pip or conda."
        return None, "", msg
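
# e.g. check_cf_compliance('file.nc', '1.8') runs `cfchecks -v 1.8 file.nc`
# and returns (returncode, stdout, stderr).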


def parse_cf_output(output):
    """
    Parse cfchecks output by reading the final summary lines.
    """
    errors, warnings, infos = 0, 0, 0
    for line in output.splitlines():
        line = line.strip()
        if line.startswith("ERRORS detected:"):
            errors = int(line.split(":")[1].strip())
        elif line.startswith("WARNINGS given:"):
            warnings = int(line.split(":")[1].strip())
        elif line.startswith("INFORMATION messages:"):
            infos = int(line.split(":")[1].strip())
    return errors, warnings, infos
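
# The summary lines this parser looks for in the cfchecks totals look like
# (counts are illustrative):
#   ERRORS detected: 2
#   WARNINGS given: 5
#   INFORMATION messages: 3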


def main():
    if len(sys.argv) < 2:
        print("Usage: python check_cf_compliance.py <parent_directory> [CF_version] [--csv] [--excel]")
        sys.exit(1)

    parent_dir = sys.argv[1]
    cf_version = None
    export_csv = False
    export_excel = False

    # Parse args
    for arg in sys.argv[2:]:
        if arg.lower() == "--csv":
            export_csv = True
        elif arg.lower() == "--excel":
            export_excel = True
        elif not arg.startswith("--"):
            cf_version = arg

    files_to_check = first_netcdf_in_subdirs(parent_dir)

    if not files_to_check:
        print("No NetCDF files (*.nc) found in subdirectories of:", parent_dir)
        sys.exit(0)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Start logs fresh
    with open(DETAILS_LOG, "w") as f:
        f.write("===== CF-Checker Detailed Run: {} =====\n".format(datetime.now()))
        f.write(f"Parent directory: {parent_dir}\n")
        if cf_version:
            f.write(f"CF version: {cf_version}\n")
        f.write("\n")

    with open(SUMMARY_LOG, "w") as f:
        f.write("===== CF-Checker Summary Run: {} =====\n".format(datetime.now()))
        f.write(f"Parent directory: {parent_dir}\n")
        if cf_version:
            f.write(f"CF version: {cf_version}\n")
        f.write("\n")

    print(f"Checking {len(files_to_check)} NetCDF files with CF-Checker...\n")

    summary = []

    for fpath in files_to_check:
        print(f"--- Checking file: {fpath} ---")

        rc, out, err = check_cf_compliance(fpath, cf_version)

        if rc is None:
            # cfchecks is not installed; report and stop instead of silently
            # skipping every remaining file
            print(err)
            with open(DETAILS_LOG, "a") as f:
                f.write(f"--- {fpath} ---\n{err}\n\n")
            sys.exit(1)

        errors, warnings, infos = parse_cf_output(out + err)

        summary.append([os.path.basename(fpath), errors, warnings, infos, "PASS" if errors == 0 else "FAIL"])

        # Save detailed output
        with open(DETAILS_LOG, "a") as f:
            f.write(f"--- Checking file: {fpath} ---\n")
            f.write(out + "\n")
            if err:
                f.write("STDERR:\n" + err + "\n")
            f.write("\n")

    # Build summary table
    headers = ["File", "Errors", "Warnings", "Info", "Result"]
    table = tabulate(summary, headers, tablefmt="github")

    with open(SUMMARY_LOG, "a") as f:
        f.write(table + "\n")

    print("\n===== Summary Table =====")
    print(table)

    # Export options with timestamped filenames
    df = pd.DataFrame(summary, columns=headers)

    if export_csv:
        csv_file = f"cf_compliance_summary_{timestamp}.csv"
        df.to_csv(csv_file, index=False)
        print(f"\n📄 Summary exported to CSV: {csv_file}")

    if export_excel:
        excel_file = f"cf_compliance_summary_{timestamp}.xlsx"
        df.to_excel(excel_file, index=False)
        print(f"\n📄 Summary exported to Excel: {excel_file}")


if __name__ == "__main__":
    main()