Commit 9bcd7de8 authored by Jonathan Minz

created cf compliance checker script

parent dcbd99e2

2NETCDF.py

deleted 100644 → 0
+0 −330
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Conversion tool for StreamLine .hpl files into netCDF (.nc) files:
    
    - hpl2dict(): import .hpl files and save as dictionary
    - hpl_to_netcdf(): save .hpl files into level0 (l0) .nc files
    - to_netcdf_l1(): correct raw data (l0) and create level1 (l1) netCDF files
    
"""
import numpy as np
from netCDF4 import Dataset
import os
import datetime
import xarray as xr
import matplotlib.dates as mdates
import glob # Added by Jonathan Minz - 25.04.2024
import sys

'''
Import StreamLine .hpl (text) files and save them locally. The data is
converted into matrices of dimension "number of range gates" x "time stamp/rays".
In newer versions of the StreamLine software, the spectral width can be
stored as an additional parameter in the .hpl files.
'''
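# A hedged sketch of the fixed 17-line .hpl header this parser assumes. The
# labels below are illustrative only, since hpl2dict() reads just the last
# whitespace-separated token of each header line:
#   lines[0]  "Filename:  Stare_115_20240423_10.hpl"  -> filename
#   lines[1]  "System ID: 115"                        -> system_id
#   lines[2]  "Number of gates: 104"                  -> number_of_gates
#   lines[3]  "Range gate length (m): 30.0"           -> range_gate_length_m
# After the header, each ray consists of one ray-header line (decimal time,
# azimuth, elevation, pitch, roll) followed by one line per range gate.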
def hpl2dict(file_path):
    # import .hpl file into internal storage
    with open(file_path, 'r') as text_file:
        lines=text_file.readlines()

    #write lines into Dictionary
    data_temp=dict()

    header_n=17 #length of header
    data_temp['filename']=lines[0].split()[-1]
    data_temp['system_id']=int(lines[1].split()[-1])
    data_temp['number_of_gates']=int(lines[2].split()[-1])
    data_temp['range_gate_length_m']=float(lines[3].split()[-1])
    data_temp['gate_length_pts']=int(lines[4].split()[-1])
    data_temp['pulses_per_ray']=int(lines[5].split()[-1])
    data_temp['number_of_waypoints_in_file']=int(lines[6].split()[-1])
    rays_n=(len(lines)-header_n)/(data_temp['number_of_gates']+1)
    
    '''
    The number of lines does not match the expected format if the number of
    range gates was changed during the measuring period of the data file
    (especially possible for stare data).
    '''
    if not rays_n.is_integer():
        print('Number of lines does not match expected format')
        return np.nan
    
    data_temp['no_of_rays_in_file']=int(rays_n)
    data_temp['scan_type']=' '.join(lines[7].split()[2:])
    data_temp['focus_range']=lines[8].split()[-1]
    data_temp['start_time']=' '.join(lines[9].split()[-2:])
    data_temp['resolution']=('%s %s' % (lines[10].split()[-1],'m s-1'))
    data_temp['range_gates']=np.arange(0,data_temp['number_of_gates'])
    data_temp['center_of_gates']=(data_temp['range_gates']+0.5)*data_temp['range_gate_length_m']

    #dimensions of data set
    gates_n=data_temp['number_of_gates']
    rays_n=data_temp['no_of_rays_in_file']

    # measurement variables are preallocated as NaN-filled numpy arrays of fixed shape
    data_temp['radial_velocity'] = np.full([gates_n,rays_n],np.nan) #m s-1
    data_temp['intensity'] = np.full([gates_n,rays_n],np.nan) #SNR+1
    data_temp['beta'] = np.full([gates_n,rays_n],np.nan) #m-1 sr-1
    data_temp['spectral_width'] = np.full([gates_n,rays_n],np.nan)
    data_temp['elevation'] = np.full(rays_n,np.nan) #degrees
    data_temp['azimuth'] = np.full(rays_n,np.nan) #degrees
    data_temp['decimal_time'] = np.full(rays_n,np.nan) #hours
    data_temp['pitch'] = np.full(rays_n,np.nan) #degrees
    data_temp['roll'] = np.full(rays_n,np.nan) #degrees
    
    for ri in range(0,rays_n): #loop rays
        lines_temp = lines[header_n+(ri*gates_n)+ri+1:header_n+(ri*gates_n)+gates_n+ri+1]
        header_temp = np.asarray(lines[header_n+(ri*gates_n)+ri].split(),dtype=float)
        data_temp['decimal_time'][ri] = header_temp[0]
        data_temp['azimuth'][ri] = header_temp[1]
        data_temp['elevation'][ri] = header_temp[2]
        data_temp['pitch'][ri] = header_temp[3]
        data_temp['roll'][ri] = header_temp[4]
        for gi in range(0,gates_n): #loop range gates
            line_temp=np.asarray(lines_temp[gi].split(),dtype=float)
            data_temp['radial_velocity'][gi,ri] = line_temp[1]
            data_temp['intensity'][gi,ri] = line_temp[2]
            data_temp['beta'][gi,ri] = line_temp[3]
            if line_temp.size>4:
                data_temp['spectral_width'][gi,ri] = line_temp[4]

    return data_temp
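
# A minimal usage sketch (hypothetical path):
# data = hpl2dict('/data/hpl/Stare_115_20240423_10.hpl')
# if isinstance(data, dict):
#     print(data['number_of_gates'], data['no_of_rays_in_file'])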

'''
Write the .hpl content into a level0 (l0) netCDF file; no data is added,
changed or removed. Information about institution and contact is optional.
'''
def hpl_to_netcdf(file_path,path_out,institution=None,contact=None,overwrite=False):
    # check that the input file exists
    if not os.path.exists(file_path):
        print('%s cannot be found' %os.path.basename(file_path))
        return
    
    # import data as dictionary
    data_temp = hpl2dict(file_path)
    if not isinstance(data_temp, dict): return
    
    '''
    Level0 netCDF files are stored in a folder structure that mirrors the
    structure used by the StreamLine software:
    path_out/yyyy/yyyymm/yyyymmdd/file_name.nc
    '''
    datestr=[fns for fns in data_temp['filename'].split('_') if len(fns)==8][0]
    path_out_date=os.path.join(path_out, datestr[0:4], datestr[0:6], datestr)
    # create the output directory if it does not exist yet
    if not os.path.exists(path_out_date):
        os.makedirs(path_out_date)

    file_name_out = data_temp['filename'].split('.')[0]+'_l0.nc'
    path_file = os.path.join(path_out_date,file_name_out)
    if os.path.isfile(path_file):
        if not overwrite:
            raise Exception('%s already exists' % path_file)
        else:
            os.remove(path_file)

    dataset_temp = Dataset(path_file,'w',format ='NETCDF4')
    # define dimensions
    dataset_temp.createDimension('NUMBER_OF_GATES',data_temp['number_of_gates'])
    dataset_temp.createDimension('NUMBER_OF_RAYS',data_temp['no_of_rays_in_file'])
    # Metadata
    dataset_temp.description = "non-processed data of Halo Photonics Streamline"
    if institution:
        dataset_temp.institution = institution
    if contact:
        dataset_temp.contact = contact
    dataset_temp.focus_range = '%s m' % data_temp['focus_range']
    dataset_temp.range_gate_length = '%i m' % data_temp['range_gate_length_m']
    dataset_temp.pulses_per_ray = data_temp['pulses_per_ray']
    dataset_temp.start_time = data_temp['start_time']
    dataset_temp.system_id = data_temp['system_id']
    dataset_temp.scan_type = data_temp['scan_type']
    dataset_temp.resolution = data_temp['resolution']
    dataset_temp.number_waypoint = data_temp['number_of_waypoints_in_file']
    dataset_temp.history = 'File created on %s ' % datetime.datetime.now().strftime('%d %b %Y %H:%M')
    
    decimal_time = dataset_temp.createVariable('decimal_time', np.float64, ('NUMBER_OF_RAYS'))
    decimal_time.units = 'decimal time (hours) UTC'
    decimal_time.long_name = 'start time of each ray'
    decimal_time[:] = data_temp['decimal_time']
    
    azi = dataset_temp.createVariable('azimuth', np.float32, ('NUMBER_OF_RAYS'))
    azi.units = 'degrees, meteorologically'
    azi.long_name = 'azimuth angle'
    azi[:] = data_temp['azimuth']
    
    ele = dataset_temp.createVariable('elevation', np.float32, ('NUMBER_OF_RAYS'))
    ele.units = 'degrees, meteorologically'
    ele.long_name = 'elevation angle'
    ele[:] = data_temp['elevation']
    
    pitch = dataset_temp.createVariable('pitch_angle', np.float32, ('NUMBER_OF_RAYS'))
    pitch.units = 'degrees'
    pitch.long_name = 'pitch angle'
    pitch[:] = data_temp['pitch']
    
    roll = dataset_temp.createVariable('roll_angle', np.float32, ('NUMBER_OF_RAYS'))
    roll.units = 'degrees'
    roll.long_name = 'roll angle'
    roll[:] = data_temp['roll']
    
    rv = dataset_temp.createVariable('radial_velocity', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    rv.units = 'm s-1'
    rv.long_name = 'Doppler velocity along line of sight'
    rv[:,:] = data_temp['radial_velocity']

    intensity = dataset_temp.createVariable('intensity', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    intensity.units = 'unitless'
    intensity.long_name = 'SNR + 1'
    intensity[:,:] = data_temp['intensity']
    
    beta = dataset_temp.createVariable('beta', np.float32,('NUMBER_OF_GATES','NUMBER_OF_RAYS'))
    beta.units = 'm-1 sr-1'
    beta.long_name = 'attenuated backscatter'
    beta[:,:] = data_temp['beta']
    
    gate_centers = dataset_temp.createVariable('gate_centers',np.float32,('NUMBER_OF_GATES'))
    gate_centers.units = 'm'
    gate_centers.long_name = 'center of range gates'
    gate_centers[:] = data_temp['center_of_gates']
    dataset_temp.close()
    print('%s is created successfully' % path_file)
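
# A minimal usage sketch (hypothetical paths; institution and contact are optional):
# hpl_to_netcdf('/data/hpl/Stare_115_20240423_10.hpl', '/data/netcdf/level0',
#               institution='University of XY', contact='name@example.org',
#               overwrite=True)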

'''
Class that stores location and orientation information about the Doppler lidar
instrument.
'''

class lidar_location:
    def __init__(self,lat,lon,zsl,name,lidar_id,bearing,gc_corr=0,diff_WGS84=np.nan,diff_geoid=np.nan,diff_bessel=np.nan):
        self.lat = lat
        self.lon = lon
        self.zsl = zsl
        self.name = name
        self.lidar_id = lidar_id
        self.bearing = bearing
        self.gc_corr = gc_corr # range gate center correction; referenced in to_netcdf_l1()
        self.diff_WGS84 = diff_WGS84
        self.diff_geoid = diff_geoid
        self.diff_bessel = diff_bessel
        
'''
Convert level0 netCDF data into level1 netCDF data.
Input variables:
    file_path       - string
    file_name_out   - string
    lidar_info      - instance of lidar_location(lat,lon,zsl,name,lidar_id,bearing,gc_corr=0,diff_WGS84=np.nan,diff_geoid=np.nan,diff_bessel=np.nan)
    path_out        - string
The lidar_info object needs at least the attributes lat, lon, zsl, bearing and
gc_corr; see the usage sketch after this function.
'''
def to_netcdf_l1(file_path,file_name_out,lidar_info,path_out):
    
    ds_temp=xr.open_dataset(file_path)
    
    if not os.path.exists(path_out): os.makedirs(path_out)
                    
    ds_temp.azimuth.values=ds_temp.azimuth.values-lidar_info.bearing
    ds_temp.azimuth.attrs['comment']='corrected azimuth angle (az_corrected=az_measured-bearing) --> az_corrected = 0 deg points to geographical North'
    # the combined condition is tested first; otherwise it would be unreachable
    if lidar_info.gc_corr!=0 and lidar_info.bearing!=0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: azimuth, gate_centers'
    elif lidar_info.bearing!=0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: azimuth'
    elif lidar_info.gc_corr==0 and lidar_info.bearing==0:
        ds_temp.attrs['description']='corrected data of Halo Photonics Streamline, corrected variables: none'
             
    lat_var=xr.Variable([],lidar_info.lat,\
                        attrs={'units':'decimal degree north',\
                                'long_name':'latitude',\
                                'description':'latitude, north is positive',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(lat=lat_var)
    
    lon_var=xr.Variable([],lidar_info.lon,\
                        attrs={'units':'decimal degrees east',\
                                'long_name':'longitude',\
                                'description':'longitude, east is positive',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(lon=lon_var)
    
    
    zsl_var=xr.Variable([],lidar_info.zsl,\
                        attrs={'units':'m',\
                                'long_name':'altitude',\
                                'description':'Height above mean sea level, Gebrauchshoehe Adria; transformation to other references is possible by using the additive factor supplied in the fields \"difference_to_geoid\", \"difference_to_bessel\" and \"difference_to_WGS84\", which are added as a list of values, with each entry corresponding to one station, e.g. zsl[1]+zsl.difference_to_geoid[1] = height above Geoid for the first station',\
                                'missing_value':-999.,\
                                'difference_to_geoid':lidar_info.diff_geoid,\
                                'difference_to_geoid_descr':'Difference between zsl and height above geoid for each STATION_KEY, e.g. zsl[1] + zsl.difference_to_geoid[1] = Height above Geoid in m',\
                                'difference_to_bessel':lidar_info.diff_bessel,\
                                'difference_to_bessel_descr':'Difference between zsl and height above Bessel 1841 reference ellipsoid for geodetic datum MGI for each STATION_KEY; e.g. zsl[1] + zsl.difference_to_bessel[1] = Height above Bessel ellipsoid in m',\
                                'difference_to_WGS84':lidar_info.diff_WGS84,\
                                'difference_to_WGS_descr':'Difference between zsl and height above WGS84 reference ellipsoid for each STATION_KEY, e.g., zsl[1] + zsl.difference_to_WGS84[1] = Height above WGS84 ellipsoid'})
    ds_temp=ds_temp.assign(zsl=zsl_var)
    
    bearing_var=xr.Variable([],lidar_info.bearing,\
                        attrs={'units':'degrees',\
                                'long_name':'angle of bearing offset',\
                                'description':'angle between the theoretical position of North direction in projected plane (epsg:31254) and North alignment of the instrument: bearing = North_lidar - North_projected ',\
                                'missing_value':-999.})
    ds_temp=ds_temp.assign(bearing=bearing_var)

    dec_time_temp=ds_temp.decimal_time.values
    
    # abort if the file also contains data from the next day
    # (decimal_time would wrap past midnight)
    if dec_time_temp[-1]<dec_time_temp[0]:
        sys.exit('decimal_time wraps past midnight; file spans two days')
    
    date_num=mdates.datestr2num(ds_temp.start_time.split()[0])
    dn_time_temp=date_num+dec_time_temp/24
    date_time=mdates.num2date(dn_time_temp)
    unix_time=[(dt- datetime.datetime(1970,1,1,tzinfo=datetime.timezone.utc)).total_seconds() for dt in date_time]
    
    dn_time_temp_var=xr.Variable(['NUMBER_OF_RAYS'],dn_time_temp,\
                        attrs={'units': 'Days since 01-01-0001 00:00:00',\
                                'long_name':'start time number of each ray'})
    ds_temp=ds_temp.assign(datenum_time=dn_time_temp_var)
    
    dt_time_temp_var=xr.Variable(['NUMBER_OF_RAYS'],unix_time,\
                        attrs={'units': 'Seconds since 01-01-1970 00:00:00',\
                                'long_name':'start time number of each ray',\
                                'description':'UNIX timestamp'})
    ds_temp=ds_temp.assign(time=dt_time_temp_var)
       
    path_out_temp = os.path.join(path_out,file_name_out)

    if os.path.isfile(path_out_temp): os.remove(path_out_temp)
    
    ds_temp.to_netcdf(path_out_temp)
    ds_temp.close()
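
# A minimal usage sketch (hypothetical paths and illustrative station values):
# info = lidar_location(48.710612, 9.190746, 388.0, 'UHOH', 115, 0.0)
# to_netcdf_l1('/data/netcdf/level0/Stare_115_20240423_10_l0.nc',
#              'Stare_115_20240423_10_l1.nc', info, '/data/netcdf/level1')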

# Added by Jonathan Minz - 25.04.2024
if __name__ == '__main__':
    # location where the l0 netcdf files are stored;
    # this directory structure is created by hpl_to_netcdf() above - NOT user defined
    file_path = '/Users/jminz/Documents/Project/2024/DataManagement_LAFI/Script_testing/Data Lidar/2024/202404/20240423'
    
    # location where l1 netcdf files should be stored. 
    # User defined. 
    path_out = '/Users/jminz/Documents/Project/2024/DataManagement_LAFI/Script_testing/Data Lidar'
    print('finding hpl files to convert to netcdf')
    list_of_hpl_files = glob.glob('Stare*')
    
    # instantiate the UHOH Doppler lidar
    # positional arguments are lidar_location(lat, lon, zsl, name, lidar_id, bearing);
    # the station name 'UHOH' is an assumption based on the comment above
    lidar_info = lidar_location(48.710612, 9.190746, 388.0, 'UHOH', 115, 0.0)

    # convert hpl files to l0 netcdf files
    print(f'found {len(list_of_hpl_files)} files')
    for idx, file in enumerate(list_of_hpl_files):
        print(f'converting file no. {idx+1} {file} to l0 netcdf')
        hpl_to_netcdf(file, path_out)
    print('converted hpl files to l0 netcdf files')
    
    # convert l0 netcdf files to l1 netcdf files
    print('converting l0 to l1 netcdf files')
    list_of_l0_ncfiles = glob.glob(os.path.join(file_path,'Stare*_l0.nc'))
    print(f'found {len(list_of_l0_ncfiles)} files')
    for idx_nc, file_nc in enumerate(list_of_l0_ncfiles):
        print(f'converting file {file_nc}')
        # derive the l1 file name from the l0 file name instead of relying on a
        # fixed path depth and splitting on the letter 'l'
        file_out_name_l1 = os.path.basename(file_nc).replace('_l0.nc', '_l1.nc')
        print(f'{file_out_name_l1}')
        print('converting l0 netcdf to l1 file')
        to_netcdf_l1(file_nc, file_out_name_l1, lidar_info, path_out)
    print('SUCCESS: conversion from hpl to l1 netcdf complete')
\ No newline at end of file

Phython_hpl_plots.py

deleted 100644 → 0
+0 −476

File deleted.


scripts/.DS_Store

0 → 100644
+6 KiB

File added.


check_cf_compliance.py

+177 −0
#!/usr/bin/env python3
"""
check_cf_compliance.py

Finds the first NetCDF file in each subdirectory of a given folder
and runs CF-Checker (cfchecks) to test compliance.

- Detailed results (per file) are saved in cf_compliance_details.log
- Summary table is saved in cf_compliance_summary.log
- Optionally export summary to CSV/Excel (with timestamped filenames)
"""

import os
import glob
import subprocess
import sys
from datetime import datetime
from tabulate import tabulate
import pandas as pd

DETAILS_LOG = "cf_compliance_details.log"
SUMMARY_LOG = "cf_compliance_summary.log"


def first_netcdf_in_subdirs(parent_dir):
    """
    Return a list of the first NetCDF file (alphabetically) in each subdirectory.
    """
    first_files = []

    for subdir in glob.glob(os.path.join(parent_dir, "*/")):
        files = glob.glob(os.path.join(subdir, "*.nc"))
        files = [f for f in files if os.path.isfile(f)]

        if files:
            files.sort()  # alphabetically
            first_files.append(files[0])

    return first_files
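
# e.g. first_netcdf_in_subdirs('/data/netcdf') might return (hypothetical paths):
# ['/data/netcdf/20240423/Stare_115_20240423_10_l0.nc', ...]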


def check_cf_compliance(file_path, cf_version=None):
    """
    Run cfchecks on a single NetCDF file and return results.
    """
    cmd = ["cfchecks"]

    if cf_version:
        cmd += ["-v", cf_version]

    cmd.append(file_path)

    try:
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=False
        )
        return completed.returncode, completed.stdout, completed.stderr
    except FileNotFoundError:
        msg = "Error: 'cfchecks' command not found. Install cfchecker with pip or conda."
        return None, "", msg
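
# e.g. check_cf_compliance('file.nc', '1.8') runs `cfchecks -v 1.8 file.nc`
# and returns (returncode, stdout, stderr).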


def parse_cf_output(output):
    """
    Parse cfchecks output by reading the final summary lines.
    """
    errors, warnings, infos = 0, 0, 0
    for line in output.splitlines():
        line = line.strip()
        if line.startswith("ERRORS detected:"):
            errors = int(line.split(":")[1].strip())
        elif line.startswith("WARNINGS given:"):
            warnings = int(line.split(":")[1].strip())
        elif line.startswith("INFORMATION messages:"):
            infos = int(line.split(":")[1].strip())
    return errors, warnings, infos
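
# The summary lines this parser looks for in the cfchecks totals look like
# (counts are illustrative):
#   ERRORS detected: 2
#   WARNINGS given: 5
#   INFORMATION messages: 3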


def main():
    if len(sys.argv) < 2:
        print("Usage: python check_cf_compliance.py <parent_directory> [CF_version] [--csv] [--excel]")
        sys.exit(1)

    parent_dir = sys.argv[1]
    cf_version = None
    export_csv = False
    export_excel = False

    # Parse args
    for arg in sys.argv[2:]:
        if arg.lower() == "--csv":
            export_csv = True
        elif arg.lower() == "--excel":
            export_excel = True
        elif not arg.startswith("--"):
            cf_version = arg

    files_to_check = first_netcdf_in_subdirs(parent_dir)

    if not files_to_check:
        print("No NetCDF files (*.nc) found in subdirectories of:", parent_dir)
        sys.exit(0)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Start logs fresh
    with open(DETAILS_LOG, "w") as f:
        f.write("===== CF-Checker Detailed Run: {} =====\n".format(datetime.now()))
        f.write(f"Parent directory: {parent_dir}\n")
        if cf_version:
            f.write(f"CF version: {cf_version}\n")
        f.write("\n")

    with open(SUMMARY_LOG, "w") as f:
        f.write("===== CF-Checker Summary Run: {} =====\n".format(datetime.now()))
        f.write(f"Parent directory: {parent_dir}\n")
        if cf_version:
            f.write(f"CF version: {cf_version}\n")
        f.write("\n")

    print(f"Checking {len(files_to_check)} NetCDF files with CF-Checker...\n")

    summary = []

    for fpath in files_to_check:
        print(f"--- Checking file: {fpath} ---")

        rc, out, err = check_cf_compliance(fpath, cf_version)

        if rc is None:
            # cfchecks is not installed; report and stop instead of silently
            # skipping every remaining file
            print(err)
            with open(DETAILS_LOG, "a") as f:
                f.write(f"--- {fpath} ---\n{err}\n\n")
            sys.exit(1)

        errors, warnings, infos = parse_cf_output(out + err)

        summary.append([os.path.basename(fpath), errors, warnings, infos, "PASS" if errors == 0 else "FAIL"])

        # Save detailed output
        with open(DETAILS_LOG, "a") as f:
            f.write(f"--- Checking file: {fpath} ---\n")
            f.write(out + "\n")
            if err:
                f.write("STDERR:\n" + err + "\n")
            f.write("\n")

    # Build summary table
    headers = ["File", "Errors", "Warnings", "Info", "Result"]
    table = tabulate(summary, headers, tablefmt="github")

    with open(SUMMARY_LOG, "a") as f:
        f.write(table + "\n")

    print("\n===== Summary Table =====")
    print(table)

    # Export options with timestamped filenames
    df = pd.DataFrame(summary, columns=headers)

    if export_csv:
        csv_file = f"cf_compliance_summary_{timestamp}.csv"
        df.to_csv(csv_file, index=False)
        print(f"\n📄 Summary exported to CSV: {csv_file}")

    if export_excel:
        excel_file = f"cf_compliance_summary_{timestamp}.xlsx"
        df.to_excel(excel_file, index=False)
        print(f"\n📄 Summary exported to Excel: {excel_file}")


if __name__ == "__main__":
    main()