From 0ae76894ed123cb11f924eeb4ef3aafa2dc90caa Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 16:50:38 +0100 Subject: [PATCH 1/7] change Paths object name to Flight --- src/halodrops/helper/paths.py | 2 +- src/halodrops/pipeline.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/halodrops/helper/paths.py b/src/halodrops/helper/paths.py index 9fb06c2..6a58a36 100644 --- a/src/halodrops/helper/paths.py +++ b/src/halodrops/helper/paths.py @@ -11,7 +11,7 @@ module_logger = logging.getLogger("halodrops.helper.paths") -class Paths: +class Flight: """ Deriving paths from the provided directory diff --git a/src/halodrops/pipeline.py b/src/halodrops/pipeline.py index fdeda93..649a63b 100644 --- a/src/halodrops/pipeline.py +++ b/src/halodrops/pipeline.py @@ -1,4 +1,4 @@ -from .helper.paths import Paths +from .helper.paths import Flight from .sonde import Sonde import configparser import inspect @@ -137,9 +137,9 @@ def get_args_for_function(config, function): return args -def create_and_populate_Paths_object(config: configparser.ConfigParser) -> Paths: +def create_and_populate_flight_object(config: configparser.ConfigParser) -> Flight: """ - Creates a Paths object and populates it with A-files. + Creates a Flight object and populates it with A-files. Parameters ---------- @@ -148,15 +148,15 @@ def create_and_populate_Paths_object(config: configparser.ConfigParser) -> Paths Returns ------- - Paths - A Paths object. + Flight + A Flight object. 
""" output = {} - mandatory = get_mandatory_args(Paths) + mandatory = get_mandatory_args(Flight) mandatory_args = get_mandatory_values_from_config(config, mandatory) - output["paths"] = Paths(**mandatory_args) - output["sondes"] = output["paths"].populate_sonde_instances() - return output["paths"], output["sondes"] + output["flight"] = Flight(**mandatory_args) + output["sondes"] = output["flight"].populate_sonde_instances() + return output["flight"], output["sondes"] def iterate_Sonde_method_over_dict_of_Sondes_objects( @@ -299,10 +299,10 @@ def run_pipeline(pipeline: dict, config: configparser.ConfigParser): pipeline = { - "create_paths": { + "create_flight": { "intake": None, - "apply": create_and_populate_Paths_object, - "output": ["paths", "sondes"], + "apply": create_and_populate_flight_object, + "output": ["flight", "sondes"], }, "qc": { "intake": "sondes", From 29096b163dfe3e3aaa180dbea011bdc2b637e10d Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 17:09:41 +0100 Subject: [PATCH 2/7] create Platform object --- src/halodrops/helper/paths.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/halodrops/helper/paths.py b/src/halodrops/helper/paths.py index 6a58a36..a4c377a 100644 --- a/src/halodrops/helper/paths.py +++ b/src/halodrops/helper/paths.py @@ -11,6 +11,38 @@ module_logger = logging.getLogger("halodrops.helper.paths") +class Platform: + """ + Deriving flight paths from the provided platform directory + + The input should align in terms of hierarchy and nomenclature + with the {doc}`Directory Structure ` that `halodrops` expects. 
+ """ + + def __init__( + self, data_directory, platform_id, platform_directory_name=None + ) -> None: + self.platform_id = platform_id + self.platform_directory_name = platform_directory_name + self.data_directory = data_directory + self.flight_ids = self.get_flight_ids() + + def get_flight_ids(self): + """Returns a list of flight IDs for the given platform directory""" + if self.platform_directory_name is None: + platform_dir = os.path.join(self.data_directory, self.platform_id) + else: + platform_dir = os.path.join( + self.data_directory, self.platform_directory_name + ) + + flight_ids = [] + for flight_dir in os.listdir(platform_dir): + if os.path.isdir(os.path.join(platform_dir, flight_dir)): + flight_ids.append(flight_dir) + return flight_ids + + class Flight: """ Deriving paths from the provided directory From 5cde681f18928d12b2105720f2bd0a0fc5b35954 Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 17:49:02 +0100 Subject: [PATCH 3/7] add get_platforms function --- src/halodrops/pipeline.py | 62 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/halodrops/pipeline.py b/src/halodrops/pipeline.py index 649a63b..e1e37d7 100644 --- a/src/halodrops/pipeline.py +++ b/src/halodrops/pipeline.py @@ -1,7 +1,8 @@ -from .helper.paths import Flight +from .helper.paths import Platform, Flight from .sonde import Sonde import configparser import inspect +import os import xarray as xr @@ -137,6 +138,65 @@ def get_args_for_function(config, function): return args +def get_platforms(data_directory, config): + """ + Get platforms based on the directory names in `data_directory` or the user-provided `platforms` values. + + Parameters + ---------- + data_directory : str + The directory where platform data is stored. + config : ConfigParser instance + The configuration file parser. + + Returns + ------- + dict + A dictionary where keys are platform names and values are `Platform` objects. 
+ + Raises + ------ + ValueError + If `platforms` is specified in the config file but `platform_directory_names` is not, or + if a value in `platform_directory_names` does not correspond to a directory in `data_directory`. + + """ + if config.has_option("MANDATORY", "platforms"): + if not config.has_option("MANDATORY", "platform_directory_names"): + raise ValueError( + "platform_directory_names must be provided in the config file when platforms is specified" + ) + platforms = config.get("MANDATORY", "platforms").split(",") + platform_directory_names = config.get( + "MANDATORY", "platform_directory_names" + ).split(",") + platforms = dict(zip(platforms, platform_directory_names)) + for directory_name in platform_directory_names: + if not os.path.isdir(os.path.join(data_directory, directory_name)): + raise ValueError( + f"No directory found for {directory_name} in data_directory" + ) + platform_objects = {} + for platform, platform_directory_name in platforms.items(): + platform_objects[platform] = Platform( + data_directory=data_directory, + platform_id=platform, + platform_directory_name=platform_directory_name, + ) + else: + platforms = [ + name + for name in os.listdir(data_directory) + if os.path.isdir(os.path.join(data_directory, name)) + ] + platform_objects = {} + for platform in platforms: + platform_objects[platform] = Platform( + data_directory=data_directory, platform_id=platform + ) + return platform_objects + + def create_and_populate_flight_object(config: configparser.ConfigParser) -> Flight: """ Creates a Flight object and populates it with A-files. From 267b8a2e07b220181db8d30a9768a74e66c3ad6e Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 21:46:34 +0100 Subject: [PATCH 4/7] change Flight object to accommodate new directory structure The Data_Directory should be of the structure where each directory in it should stand for a platform and directories within a platform's directory would be individual flight directories. 
This will be made mandatory. The package will then auto-infer platform names (`platforms` attribute) based on the platform directories' names (with the `get_platforms` function in the `pipeline` module). This value will go into the dataset attributes (`platform_id`) along with `flight_id`. Now, the only way to batch process will be to process all sondes of a campaign, i.e. all sondes from all flights of all platforms in a campaign. If the user wants to process only a subset, they can choose to include only a limited set of directories in the `data_directory` they provide in the config file. However, considering that the processing is not compute-heavy, I cannot think of any use cases that warrant a separate mode of batch processing. --- src/halodrops/helper/paths.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/halodrops/helper/paths.py b/src/halodrops/helper/paths.py index a4c377a..ee7ccef 100644 --- a/src/halodrops/helper/paths.py +++ b/src/halodrops/helper/paths.py @@ -51,7 +51,9 @@ class Flight: with the {doc}`Directory Structure ` that `halodrops` expects. 
""" - def __init__(self, data_directory, flight_id): + def __init__( + self, data_directory, flight_id, platform_id, platform_directory_name=None + ): """Creates an instance of Paths object for a given flight Parameters @@ -62,25 +64,33 @@ def __init__(self, data_directory, flight_id): `flight_id` : `str` Individual flight directory name + `platform_id` : `str` + Platform name + Attributes ---------- - `flight_id` + `flight_idpath` Path to flight data directory - `flight_idname` + `flight_id` Name of flight data directory `l1dir` Path to Level-1 data directory """ self.logger = logging.getLogger("halodrops.helper.paths.Paths") - self.flight_id = os.path.join(data_directory, flight_id) - self.flight_idname = flight_id - self.l0dir = os.path.join(data_directory, flight_id, "Level_0") - self.l1dir = os.path.join(data_directory, flight_id, "Level_1") + if platform_directory_name is None: + platform_directory_name = platform_id + self.flight_idpath = os.path.join( + data_directory, platform_directory_name, flight_id + ) + self.flight_id = flight_id + self.platform_id = platform_id + self.l1dir = os.path.join(self.flight_idpath, "Level_1") + self.l0dir = os.path.join(self.flight_idpath, "Level_0") self.logger.info( - f"Created Path Instance: {self.flight_id=}; {self.flight_idname=}; {self.l1dir=}" + f"Created Path Instance: {self.flight_idpath=}; {self.flight_id=}; {self.l1dir=}" ) def get_all_afiles(self): @@ -101,7 +111,7 @@ def quicklooks_path(self): `str` Path to quicklooks directory """ - quicklooks_path_str = os.path.join(self.flight_id, "Quicklooks") + quicklooks_path_str = os.path.join(self.flight_idpath, "Quicklooks") if pp(quicklooks_path_str).exists(): self.logger.info(f"Path exists: {quicklooks_path_str=}") else: @@ -112,7 +122,7 @@ def quicklooks_path(self): return quicklooks_path_str def populate_sonde_instances(self) -> Dict: - """Returns a dictionary of `Sonde` class instances for all A-files found in `flight_id` + """Returns a dictionary of `Sonde` class 
instances for all A-files found in `flight_idpath` and also sets the dictionary as value of `Sondes` attribute """ afiles = self.get_all_afiles() @@ -126,6 +136,8 @@ def populate_sonde_instances(self) -> Dict: Sondes[sonde_id] = Sonde(sonde_id, launch_time=launch_time) Sondes[sonde_id].add_launch_detect(launch_detect) + Sondes[sonde_id].add_flight_id(self.flight_id) + Sondes[sonde_id].add_platform_id(self.platform_id) Sondes[sonde_id].add_afile(a_file) if launch_detect: Sondes[sonde_id].add_postaspenfile() From 4525baf32e0a7d4d531ed0672137665d49924093 Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 21:50:54 +0100 Subject: [PATCH 5/7] add flight_id and platform_id attrs to Sonde --- src/halodrops/sonde.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/halodrops/sonde.py b/src/halodrops/sonde.py index 8d7ba35..bf53026 100644 --- a/src/halodrops/sonde.py +++ b/src/halodrops/sonde.py @@ -43,6 +43,26 @@ def __post_init__(self): if self.launch_time is not None: object.__setattr__(self, "sort_index", self.launch_time) + def add_flight_id(self, flight_id: str) -> None: + """Sets attribute of flight ID + + Parameters + ---------- + flight_id : str + The flight ID of the flight during which the sonde was launched + """ + object.__setattr__(self, "flight_id", flight_id) + + def add_platform_id(self, platform_id: str) -> None: + """Sets attribute of platform ID + + Parameters + ---------- + platform_id : str + The platform ID of the flight during which the sonde was launched + """ + object.__setattr__(self, "platform_id", platform_id) + def add_spatial_coordinates_at_launch(self, launch_coordinates: List) -> None: """Sets attributes of spatial coordinates at launch From d58abfab060aa9f5ccc2bdf07942d4bdc8661f9e Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 21:51:38 +0100 Subject: [PATCH 6/7] remove data_directory arg and get it from config --- src/halodrops/pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 
3 deletions(-) diff --git a/src/halodrops/pipeline.py b/src/halodrops/pipeline.py index e1e37d7..c20cbfe 100644 --- a/src/halodrops/pipeline.py +++ b/src/halodrops/pipeline.py @@ -138,14 +138,12 @@ def get_args_for_function(config, function): return args -def get_platforms(data_directory, config): +def get_platforms(config): """ Get platforms based on the directory names in `data_directory` or the user-provided `platforms` values. Parameters ---------- - data_directory : str - The directory where platform data is stored. config : ConfigParser instance The configuration file parser. @@ -161,6 +159,7 @@ def get_platforms(data_directory, config): if a value in `platform_directory_names` does not correspond to a directory in `data_directory`. """ + data_directory = config.get("MANDATORY", "data_directory") if config.has_option("MANDATORY", "platforms"): if not config.has_option("MANDATORY", "platform_directory_names"): raise ValueError( From f0d3928a587113583ba496ad5a553277cfe846df Mon Sep 17 00:00:00 2001 From: Geet George Date: Mon, 11 Dec 2023 21:52:25 +0100 Subject: [PATCH 7/7] get all sondes from all flights together For each `Platform` object, the `create_and_populate_flight_object` function in the pipeline module now will get all corresponding `flight_id` values by looping over all directory names in a platform's directory and process all sondes in flight-wise batches. The function also now only has one output, i.e. "sondes". The "flight" output is no more relevant. After the flight-wise batch processing is done, all L2 files in the corresponding `flight_id` directories will be populated with L2 datasets that contain the corresponding `platform_id` and `flight_id` attributes. Now, the only way to batch process will be to process for all sondes of a campaign, i.e. all sondes from all flights of all platforms in a campaign. 
If the user wants to process only a subset, they can choose to include only a limited set of directories in the `data_directory` they provide in the config file. However, considering that the processing is not compute-heavy, I cannot think of any use cases that warrant a separate mode of batch processing. --- src/halodrops/pipeline.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/halodrops/pipeline.py b/src/halodrops/pipeline.py index c20cbfe..0c815b2 100644 --- a/src/halodrops/pipeline.py +++ b/src/halodrops/pipeline.py @@ -210,12 +210,19 @@ def create_and_populate_flight_object(config: configparser.ConfigParser) -> Flig Flight A Flight object. """ + platform_objects = get_platforms(config) output = {} - mandatory = get_mandatory_args(Flight) - mandatory_args = get_mandatory_values_from_config(config, mandatory) - output["flight"] = Flight(**mandatory_args) - output["sondes"] = output["flight"].populate_sonde_instances() - return output["flight"], output["sondes"] + output["sondes"] = {} + for platform in platform_objects: + for flight_id in platform_objects[platform].flight_ids: + flight = Flight( + platform_objects[platform].data_directory, + flight_id, + platform, + platform_objects[platform].platform_directory_name, + ) + output["sondes"].update(flight.populate_sonde_instances()) + return output["sondes"] def iterate_Sonde_method_over_dict_of_Sondes_objects( @@ -361,7 +368,7 @@ def run_pipeline(pipeline: dict, config: configparser.ConfigParser): "create_flight": { "intake": None, "apply": create_and_populate_flight_object, - "output": ["flight", "sondes"], + "output": "sondes", }, "qc": { "intake": "sondes",