From b56b618a45e89914f1c097274b51665b5ebf95e3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 08:44:28 -0700 Subject: [PATCH 01/81] Establishing inital `LiveSurvey` --- config_files/live_initialization_config.yml | 57 ++++++++++ config_files/live_survey_year_2019_config.yml | 91 ++++++++++++++++ echopop/live/__init__.py | 1 + echopop/live/acoustics.py | 0 echopop/live/biology.py | 0 echopop/live/core.py | 28 +++++ echopop/live/livesurvey.py | 41 +++++++ echopop/live/load.py | 0 echopop/live/spatial.py | 0 echopop/live/write.py | 0 echopop/zarr_read_ingest_test.py | 103 ++++++++++++++++++ 11 files changed, 321 insertions(+) create mode 100644 config_files/live_initialization_config.yml create mode 100644 config_files/live_survey_year_2019_config.yml create mode 100644 echopop/live/__init__.py create mode 100644 echopop/live/acoustics.py create mode 100644 echopop/live/biology.py create mode 100644 echopop/live/core.py create mode 100644 echopop/live/livesurvey.py create mode 100644 echopop/live/load.py create mode 100644 echopop/live/spatial.py create mode 100644 echopop/live/write.py create mode 100644 echopop/zarr_read_ingest_test.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml new file mode 100644 index 00000000..ba1474aa --- /dev/null +++ b/config_files/live_initialization_config.yml @@ -0,0 +1,57 @@ +# This YAML file is a configuration file for all +# initialization parameters used for the `LiveSurvey` +# class in Echopop + +--- + ##################################################################################################################### + # Biological data processing# + ######################## + # Jolly and Hampton (1990) stratified mean calculation + bio_hake_len_bin: [ # length sequence array 2 - 80 cm. This specifies the bin centers! + 2, # start of bin centers + 80, # end of bin centers + 40 # number of bins in total + ] + + ##################################################################################################################### + # Geospatial settings# + ######################## + inpfc: # INPFC northern latitude limits and labels + latitude_max: [36.0, 40.5, 43.0, + 45.7667, 48.50, 55.0] + stratum_names: [1, 2, 3, 4, 5, 6] + geospatial: + init: epsg:4326 # EPSG integer code for geodetic parameter dataset + + ##################################################################################################################### + # Analysis settings# + ######################## + acoustics: + nasc_frequency: 38 # kHz + biology: + separate_stations: + station_id: ["length", "specimen"] + ## NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + ## comprises True/False statements that denote the desired association. All values set to "True" will be output. 
+ ## `global` --> NASC associated with sigma_bs calculated from all survey data + ## `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + ## `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls + ## `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates + link_biology_acoustics: + global: False + INPFC: True + closest_haul: False + weighted_haul: False + ## NOTE: `biological_processing` + + ##################################################################################################################### + # Log-linear regression# + ######################## + # Target strength (TS) - length (L) regression: TS=m*log10(L)+b + TS_length_regression_parameters: + pacific_hake: # corresponding species text code + number_code: 22500 # species number code + TS_L_slope: 20.0 # the 'm' or 'slope' parameter + TS_L_intercept: -68.0 # the 'b' or 'y-intercept' + length_units: cm # units for L used in regression/relationship +... diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml new file mode 100644 index 00000000..f374e624 --- /dev/null +++ b/config_files/live_survey_year_2019_config.yml @@ -0,0 +1,91 @@ +# This YAML file is a configuration file specifying +# input filenames & some process parameter settings. +# Relative file paths defined below are concatenated +# with the data_root_dir path also set below. + +--- +############################################################################## +# Parameters + +survey_year: 2019 # survey year being considered + +############################################################################## +# Directory path that contains all input data needed + +data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files + +############################################################################## +# Input data directories + +acoustic: + directory: acoustic/ + extension: zarr + sheetname: null +biological: + directory: biology/ + extension: csv + sheetname: null + + length: + directory: Biological + filename: Biological/ + length: + US: + filename: Biological/US/2019_biodata_length.xlsx + sheetname: biodata_length + CAN: + filename: Biological/CAN/2019_biodata_length_CAN.xlsx + sheetname: biodata_length_CAN + specimen: + US: + filename: Biological/US/2019_biodata_specimen_AGES.xlsx + sheetname: biodata_specimen + CAN: + filename: Biological/CAN/2019_biodata_specimen_CAN_AGES.xlsx + sheetname: biodata_specimen_CAN + catch: + US: + filename: Biological/US/2019_biodata_catch.xlsx + sheetname: biodata_catch + CAN: + filename: Biological/CAN/2019_biodata_catch_CAN.xlsx + sheetname: biodata_catch_CAN + haul_to_transect: + US: + filename: Biological/US/haul_to_transect_mapping_2019.xlsx + sheetname: Sheet1 + CAN: + filename: Biological/CAN/haul_to_transect_mapping_2019_CAN.xlsx + sheetname: Sheet1 +stratification: + strata: + # The two stratification types are found in two sheets: "Base KS" and "INPFC" + filename: Stratification/US_CAN strata 2019_final.xlsx + sheetname: Base KS + geo_strata: + # The two stratification types are found in two sheets: "stratification1" and "INPFC" + filename: Stratification/Stratification_geographic_Lat_2019_final.xlsx + sheetname: [ INPFC , stratification1 ] +NASC: + # NASC values + no_age1: + # file that excludes age1 values + filename: Exports/US_CAN_detailsa_2019_table2y+_ALL_final - updated.xlsx + sheetname: 
Sheet1 + all_ages: + # file that includes all ages + filename: Exports/US_CAN_detailsa_2019_table1y+_ALL_final - updated.xlsx + sheetname: Sheet1 +kriging: + mesh: + filename: Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx + sheetname: krigedgrid2_5nm_forChu + isobath_200m: + # filename: Kriging_files/Kriging_grid_files/Smoothing_EasyKrig.xlsx + filename: Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx + sheetname: Smoothing_EasyKrig + vario_krig_para: + # NOTE: This file is not currently used + filename: Kriging_files/default_vario_krig_settings_2019_US_CAN.xlsx + sheetname: Sheet1 +... diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py new file mode 100644 index 00000000..b8585ba9 --- /dev/null +++ b/echopop/live/__init__.py @@ -0,0 +1 @@ +from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/acoustics.py b/echopop/live/acoustics.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/biology.py b/echopop/live/biology.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/core.py b/echopop/live/core.py new file mode 100644 index 00000000..de066ae3 --- /dev/null +++ b/echopop/live/core.py @@ -0,0 +1,28 @@ +from datetime import datetime + +import pandas as pd + +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} \ No newline at end of file diff --git a/echopop/live/livesurvey.py b/echopop/live/livesurvey.py new file mode 100644 index 00000000..70765b0f --- /dev/null +++ b/echopop/live/livesurvey.py @@ -0,0 +1,41 @@ +from typing import Union +from pathlib import Path +import copy +import yaml + +from .core import( + DATA_STRUCTURE +) + +from ..acoustics import ( + ts_length_regression, + to_dB, + to_linear +) + +class LiveSurvey: + """ + A real-time processing version of the `echopop` base + `Survey` class that ingests biological, acoustic, and + event meta data to provide population estimates when + generated. 
+ """ + + def __init__( + self + ): + # Initialize `meta` attribute + self.meta = copy.deepcopy(DATA_STRUCTURE["meta"]) + + # Loading the configuration settings and definitions that are used to + # initialize the Survey class object + self.config = el.load_configuration(Path(init_config_path), Path(survey_year_config_path)) + + # Loading the datasets defined in the configuration files + self.input = el.load_survey_data(self.config) + + # Initialize the `analysis` data attribute + self.analysis = copy.deepcopy(DATA_STRUCTURE["analysis"]) + + # Initialize the `results` data attribute + self.results = copy.deepcopy(DATA_STRUCTURE["results"]) \ No newline at end of file diff --git a/echopop/live/load.py b/echopop/live/load.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/spatial.py b/echopop/live/spatial.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/write.py b/echopop/live/write.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py new file mode 100644 index 00000000..9201adc7 --- /dev/null +++ b/echopop/zarr_read_ingest_test.py @@ -0,0 +1,103 @@ +import zarr +import xarray as xr +import shutil +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +specimen_df = pd.DataFrame( + { + "haul_num": np.repeat([1,2,3], 4), + "station": "specimen", + "sex": np.tile(["male", "female"], 6), + "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), + "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, + }, +) + +length_df = pd.DataFrame( + { + "haul_num": np.repeat([1,2,3], 4), + "station": "length", + "sex": np.tile(["male", "female"], 6), + "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), + "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), + }, +) + +catch_df = pd.DataFrame( + { + "haul_num": np.array([1, 2, 3]), + "weight": np.array([503.12, 684.32, 978.54]) + } +) + +TS_SLOPE = 20.0 +TS_INTERCEPT = -68.0 + +#### +# CONCATENATE FILE SOURCES +specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() +specimen_reframed +# MELD +all_lengths = pd.concat([length_df, specimen_reframed]) +# COMBINE +comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() + + +# CONVERT TO TS +comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# TO SIGMA_BS +comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# WEIGHTED MEAN SIGMA_BS +sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +### +# INTEGRATE NASC +path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" + +Path(path2file).exists() +xds = xr.open_dataset(path2file, engine="zarr") +xds +xdf = xds.to_dataframe().reset_index() +xdf["NASC"] = xdf["NASC"].fillna(0.0) +# convert frequency +xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) +# filter +xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] + +xdf_38.plot.scatter(x="distance", y="depth", c="NASC") +plt.show() + +xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() + +plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) +plt.plot(xdf_int["longitude"], xdf_int["latitude"]) +plt.show() + +# CONVERT TO NUMBER DENSITY 
+xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) + + +################### +from typing import Union +from pathlib import Path +import copy +import yaml + +# from echopop.acoustics import ts_length_regression, to_dB, to_linear +# from echopop.live.core import DATA_STRUCTURE + + +### INIT CONFIG +initialization_config = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" + +# Initialize `meta` attribute +meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + +# Loading the configuration settings and definitions that are used to +# initialize the Survey class object +config = yaml.safe_load(Path(initialization_config).read_text()) + +nasc_frequency = config["acoustics"]["nasc_frequency"] \ No newline at end of file From 00c898dcb6027199c80b2648fb7974123dc8bc90 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 13:37:14 -0700 Subject: [PATCH 02/81] Initial data loading function refactoring --- config_files/live_initialization_config.yml | 79 +++-- config_files/live_survey_year_2019_config.yml | 81 +---- echopop/zarr_read_ingest_test.py | 327 +++++++++++++++++- 3 files changed, 374 insertions(+), 113 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index ba1474aa..7cff952b 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -6,52 +6,51 @@ ##################################################################################################################### # Biological data processing# ######################## - # Jolly and Hampton (1990) stratified mean calculation - bio_hake_len_bin: [ # length sequence array 2 - 80 cm. This specifies the bin centers! - 2, # start of bin centers - 80, # end of bin centers - 40 # number of bins in total - ] + biology: + # Length-binning + # NOTE: start : end : number + length_distribution: + bins: [2, 80, 40] + # Station separation + # NOTE: if `separate_stations` is True, `['list']` is required for `station_id` + stations: + separate_stations: True + station_id: ["length", "specimen"] ##################################################################################################################### # Geospatial settings# ######################## - inpfc: # INPFC northern latitude limits and labels - latitude_max: [36.0, 40.5, 43.0, - 45.7667, 48.50, 55.0] - stratum_names: [1, 2, 3, 4, 5, 6] geospatial: - init: epsg:4326 # EPSG integer code for geodetic parameter dataset + inpfc: # INPFC northern latitude limits and labels + latitude_max: [36.0, 40.5, 43.0, + 45.7667, 48.50, 55.0] + stratum_names: [1, 2, 3, 4, 5, 6] + projection: epsg:4326 # EPSG integer code for geodetic parameter dataset + # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + # comprises True/False statements that denote the desired association. All values set to "True" will be output. 
+ # `global` --> NASC associated with sigma_bs calculated from all survey data + # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls + # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates + link_biology_acoustics: + global: False + INPFC: True + closest_haul: False + weighted_haul: False ##################################################################################################################### - # Analysis settings# - ######################## + # Acoustics settings# + ######################## acoustics: - nasc_frequency: 38 # kHz - biology: - separate_stations: - station_id: ["length", "specimen"] - ## NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This - ## comprises True/False statements that denote the desired association. All values set to "True" will be output. - ## `global` --> NASC associated with sigma_bs calculated from all survey data - ## `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs - ## `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls - ## `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates - link_biology_acoustics: - global: False - INPFC: True - closest_haul: False - weighted_haul: False - ## NOTE: `biological_processing` - - ##################################################################################################################### - # Log-linear regression# - ######################## - # Target strength (TS) - length (L) regression: TS=m*log10(L)+b - TS_length_regression_parameters: - pacific_hake: # corresponding species text code - number_code: 22500 # species number code - TS_L_slope: 20.0 # the 'm' or 'slope' parameter - TS_L_intercept: -68.0 # the 'b' or 'y-intercept' - length_units: cm # units for L used in regression/relationship + # Acoustic transmit frequency (Hz or kHz) + transmit: + frequency: 38.0 + units: kHz + # Target strength (TS) - length (L) regression: TS=m*log10(L)+b + TS_length_regression_parameters: + pacific_hake: # corresponding species text code + number_code: 22500 # species number code + TS_L_slope: 20.0 # the 'm' or 'slope' parameter + TS_L_intercept: -68.0 # the 'b' or 'y-intercept' + length_units: cm # units for L used in regression/relationship ... 
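A minimal sketch of how the TS-length regression parameters defined above (TS = m*log10(L) + b with slope 20.0, intercept -68.0, lengths in cm) are typically applied to obtain the linear backscattering cross-section; the helper name `length_to_sigma_bs` is illustrative and not part of this patch:

import numpy as np
import pandas as pd

def length_to_sigma_bs(length_cm: pd.Series, slope: float = 20.0, intercept: float = -68.0) -> pd.Series:
    # TS = m * log10(L) + b, in dB re 1 m^2
    ts = slope * np.log10(length_cm) + intercept
    # Back-transform to the linear domain: sigma_bs = 10^(TS / 10), in m^2
    return 10.0 ** (ts / 10.0)

# Weighted-mean sigma_bs over binned length counts (mirrors the scratch script in this patch series)
lengths = pd.Series([11.0, 18.0, 23.0])
counts = pd.Series([103, 52, 81])
sigma_bs_mean = np.average(length_to_sigma_bs(lengths), weights=counts)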
diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index f374e624..6272b0fd 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -16,76 +16,13 @@ data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files ############################################################################## # Input data directories - -acoustic: - directory: acoustic/ - extension: zarr - sheetname: null -biological: - directory: biology/ - extension: csv - sheetname: null - - length: - directory: Biological - filename: Biological/ - length: - US: - filename: Biological/US/2019_biodata_length.xlsx - sheetname: biodata_length - CAN: - filename: Biological/CAN/2019_biodata_length_CAN.xlsx - sheetname: biodata_length_CAN - specimen: - US: - filename: Biological/US/2019_biodata_specimen_AGES.xlsx - sheetname: biodata_specimen - CAN: - filename: Biological/CAN/2019_biodata_specimen_CAN_AGES.xlsx - sheetname: biodata_specimen_CAN - catch: - US: - filename: Biological/US/2019_biodata_catch.xlsx - sheetname: biodata_catch - CAN: - filename: Biological/CAN/2019_biodata_catch_CAN.xlsx - sheetname: biodata_catch_CAN - haul_to_transect: - US: - filename: Biological/US/haul_to_transect_mapping_2019.xlsx - sheetname: Sheet1 - CAN: - filename: Biological/CAN/haul_to_transect_mapping_2019_CAN.xlsx - sheetname: Sheet1 -stratification: - strata: - # The two stratification types are found in two sheets: "Base KS" and "INPFC" - filename: Stratification/US_CAN strata 2019_final.xlsx - sheetname: Base KS - geo_strata: - # The two stratification types are found in two sheets: "stratification1" and "INPFC" - filename: Stratification/Stratification_geographic_Lat_2019_final.xlsx - sheetname: [ INPFC , stratification1 ] -NASC: - # NASC values - no_age1: - # file that excludes age1 values - filename: Exports/US_CAN_detailsa_2019_table2y+_ALL_final - updated.xlsx - sheetname: Sheet1 - all_ages: - # file that includes all ages - filename: Exports/US_CAN_detailsa_2019_table1y+_ALL_final - updated.xlsx - sheetname: Sheet1 -kriging: - mesh: - filename: Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx - sheetname: krigedgrid2_5nm_forChu - isobath_200m: - # filename: Kriging_files/Kriging_grid_files/Smoothing_EasyKrig.xlsx - filename: Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx - sheetname: Smoothing_EasyKrig - vario_krig_para: - # NOTE: This file is not currently used - filename: Kriging_files/default_vario_krig_settings_2019_US_CAN.xlsx - sheetname: Sheet1 +input_directories: + acoustic: + directory: acoustics/ + extension: zarr + sheetname: null + biological: + directory: biology/ + extension: csv + sheetname: null ... 
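The comment block at the top of this file states that the relative paths below are concatenated with `data_root_dir`; a hedged sketch of that lookup (the same pattern appears later in `load_acoustic_data`), with the config path shown here purely illustrative:

from pathlib import Path
import yaml

config = yaml.safe_load(Path("config_files/live_survey_year_2019_config.yml").read_text())
acoustic_settings = config["input_directories"]["acoustic"]
# data_root_dir + relative directory, then glob on the configured extension (e.g. *.zarr)
acoustic_dir = Path(config["data_root_dir"]) / acoustic_settings["directory"]
acoustic_files = list(acoustic_dir.glob(f"*.{acoustic_settings['extension']}"))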
diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 9201adc7..3eac35bb 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,10 +1,335 @@ import zarr import xarray as xr import shutil -from pathlib import Path import numpy as np import pandas as pd import matplotlib.pyplot as plt +from typing import Union, Tuple +from pathlib import Path +import copy +import yaml +import glob +from datetime import datetime +import geopandas as gpd + +#################################################################################################### +# * Functionality for a) loading YAML configuration file, b) search defined directory for +# * input files, c) ingest *.zarr/*.csv +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) + # ---- Raise error if needed + if config_intersect: + raise ValueError( + f"The initialization and file configuration files comprise the following intersecting " + f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " + f"file." 
+ ) + + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object + return {**init_config, **file_config} +#################################################################################################### +# TEST: YAML FILE CONFIGURATION +# ---- Define filepaths +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# ---- Run function: `live_configuration` +file_configuration = live_configuration(live_init_config_path, live_file_config_path) +file_configuration +#################################################################################################### +# * Accessory function for tuning the acoustic transmit frequency units/scaling +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == "kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values +#################################################################################################### +# * Define `LIVE_INPUT_FILE_CONFIG_MAP` configuration mapping (this will be in an equivalent +# * `core.py`) +# TODO: Update structure with additional information (as needed) +# TODO: Documentation +LIVE_INPUT_FILE_CONFIG_MAP = { + "acoustics": { + "xarray_coordinates": { + "distance": float, + "depth": float, + }, + "xarray_variables": { + "NASC": float, + "frequency_nominal": float, + "latitude": float, + "longitude": float, + "ping_time": "datetime64[ns]", + } + } +} +#################################################################################################### +# * Functionality for reading in processed acoustic data +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. +# TODO: Documentation +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Dataset]: + # Get acoustic directory and initialization settings + # ---- Files + acoustic_file_settings = file_configuration["input_directories"]["acoustic"] + # ---- General settings + acoustic_analysis_settings = file_configuration["acoustics"] + + # Create full filepath + acoustic_directory_path = ( + Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + ) + + # Validate filepath, columns, datatypes + # ---- Directory check + directory_existence = acoustic_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{acoustic_directory_path}] does not exist." 
+ ) + # ---- Get the defined file extension + file_extension = acoustic_file_settings["extension"] + # ---- In the case of a *.zarr file + if file_extension == "zarr": + # ---- Create Path.glob generator object + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" + ) + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ! [REQUIRES DASK] ---- Read in all listed files + zarr_data_ds = xr.open_mfdataset(zarr_files, + engine="zarr", + chunks="auto", + data_vars=specified_vars, + coords=specified_coords) + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()] + # ---- Validate data types + zarr_data_df_filtered = ( + zarr_data_df_filtered + .apply(lambda col: col.astype(full_config_map[col.name]) + if col.name in full_config_map else col) + ) + + # Extract defined acoustic frequency + # ---- From the configuration + transmit_settings = acoustic_analysis_settings["transmit"] + # ---- Transform `frequency_nominal`, if necessary + zarr_data_df_filtered["frequency_nominal"] = ( + configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], + transmit_settings, + zarr_data_ds["frequency_nominal"].units) + ) + # ---- Filter out any unused frequency coordinates + zarr_data_df_output = ( + zarr_data_df_filtered + [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) + # ---- Drop frequency column and return the output + return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata +#################################################################################################### +# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) +acoustic_data +coordinate_metadata +#################################################################################################### +def load_spatial_data(file_configuration: dict, + acoustic_data: pd.DataFrame, + coordinate_metadata: xr.Dataset): + + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Extract the projection + projection = file_configuration["geospatial"]["projection"] + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] + + # Validate the spatial biology-acoustics linking method + # ---- Get the biology-acoustics linking method + link_method = next(key for key, value in acoustics_biology_link.items() if value) + # ---- Flag Error if unexpected method + if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
+ ) + + # Validate projection information + # ---- Create a dummy GeoDataFrame to extract CRS information + # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) + # ---- Extract coordinate limits from the acoustic data + # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] + # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] + # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] + # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] + # # ---- Create boundary box string + # boundary_box_str = ( + # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " + # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" + # ) + + # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{utm_string_generator(lon_min, lat_min)}") + # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") + + # from pyproj import CRS + # from pyproj.aoi import AreaOfInterest + # from pyproj.database import query_utm_crs_info + + # utm_crs_list = query_utm_crs_info( + # datum_name="WGS 84", + # area_of_interest=AreaOfInterest( + # west_lon_degree=lon_min, + # south_lat_degree=lat_min, + # east_lon_degree=-lon_max, + # north_lat_degree=lat_max, + # ), + # ) + # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") + +#################################################################################################### +def live_data(file_configuration: dict): + + # Extract the file directories (or from the configuration) containing acoustic, biological, and + # spatial definitions/data/parameters + # ---- Acoustic data + acoustic_data = load_validated_acoustic_data(file_configuration) + # ---- Biological data + # ---- Spatial data + + + +#################################################################################################### +# * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) +# TODO: Update structure with additional information (as needed) +# TODO: Documentation +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} +#################################################################################################### +# * Define `LiveSurvey` class structure +# TODO: Incorporate validators +# TODO: Scope out full structure including accessors, attributes, and methods +# TODO: Configure input arguments (for initialization) +# TODO: Documentation +class LiveSurvey: + """ + A real-time processing version of the `echopop` base `Survey` class that ingests biological, + acoustic, and event meta data to provide population estimates when generated. 
+ """ + + def __init__( + self, + live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path], + ): + # Initialize `meta` attribute + self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + + # Loading the configuration settings and definitions that are used for defining the + # configuration settings + self.config = live_configuration(live_file_config_path, live_file_config_path) + + # Loading the datasets defined in the configuration files + self.input = el.load_survey_data(self.config) + + # Initialize the `results` data attribute + self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + +current_units = zarr_data_ds["frequency_nominal"].units +acoustic_analysis_settings["transmit"] +file_configuration specimen_df = pd.DataFrame( { From 69340e751adc68c1201f8590ea3a1fc959e304df Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 21:47:42 -0700 Subject: [PATCH 03/81] Updated methods --- config_files/live_initialization_config.yml | 13 ++ config_files/live_survey_year_2019_config.yml | 8 +- echopop/zarr_read_ingest_test.py | 143 ++++++++++++++++-- 3 files changed, 148 insertions(+), 16 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 7cff952b..6033595a 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -16,6 +16,9 @@ stations: separate_stations: True station_id: ["length", "specimen"] + # Trawl identifier + catch: + partition: codend ##################################################################################################################### # Geospatial settings# @@ -25,7 +28,17 @@ latitude_max: [36.0, 40.5, 43.0, 45.7667, 48.50, 55.0] stratum_names: [1, 2, 3, 4, 5, 6] + griddify: + # Coordinate bounds + bounds: + latitude: [32.75, 54.75] + longitude: [-134.75, -117.00] + # x/y (or E-W/N-S) grid resolution in nmi + grid_resolution: + x: 25.0 + y: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset + # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This # comprises True/False statements that denote the desired association. All values set to "True" will be output. # `global` --> NASC associated with sigma_bs calculated from all survey data diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index 6272b0fd..bf65930d 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -12,7 +12,7 @@ survey_year: 2019 # survey year being considered ############################################################################## # Directory path that contains all input data needed -data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files +data_root_dir: C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files ############################################################################## # Input data directories @@ -20,9 +20,11 @@ input_directories: acoustic: directory: acoustics/ extension: zarr - sheetname: null biological: directory: biology/ extension: csv - sheetname: null + file_ids: + catch: catch_perc + length: lf + specimen: spec ... 
diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 3eac35bb..10ecf076 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,6 +1,4 @@ -import zarr import xarray as xr -import shutil import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -57,8 +55,8 @@ def live_configuration(live_init_config_path: Union[str, Path], #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` file_configuration = live_configuration(live_init_config_path, live_file_config_path) file_configuration @@ -150,6 +148,8 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas full_config_map = {**acoustics_config_map["xarray_coordinates"], **acoustics_config_map["xarray_variables"]} # ! [REQUIRES DASK] ---- Read in all listed files + # TODO: The sliding/overlapping windows makes this annoying -- in theory, only a single new zarr file will be ingested + # TODO: So this needs to be replaced w/ `open_dataset` instead zarr_data_ds = xr.open_mfdataset(zarr_files, engine="zarr", chunks="auto", @@ -200,6 +200,7 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata #################################################################################################### # TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# NOTE: # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) acoustic_data @@ -214,7 +215,14 @@ def load_spatial_data(file_configuration: dict, projection = file_configuration["geospatial"]["projection"] # ---- Extract the biology-acoustics linking method options acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] - + + # Convert the DataFrame to a GeoDataFrame + acoustic_data_gdf = gpd.GeoDataFrame( + data=acoustic_data, + geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), + crs=projection + ) + # Validate the spatial biology-acoustics linking method # ---- Get the biology-acoustics linking method link_method = next(key for key, value in acoustics_biology_link.items() if value) @@ -224,6 +232,9 @@ def load_spatial_data(file_configuration: dict, f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
) + + # Create INPFC stratum dataframe + # ---- Extract # Validate projection information # ---- Create a dummy GeoDataFrame to extract CRS information @@ -406,17 +417,123 @@ def __init__( ################### -from typing import Union -from pathlib import Path -import copy -import yaml +from geopy.distance import distance +from shapely.geometry import Polygon, Point, box +import geopandas as gpd +from shapely.ops import unary_union +import pyproj + + +grid_settings = file_configuration["geospatial"]["griddify"] +grid = [] +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +lat_min = grid_settings["bounds"]["latitude"][0] +lat_max = grid_settings["bounds"]["latitude"][1] +lon_min = grid_settings["bounds"]["longitude"][0] +lon_max = grid_settings["bounds"]["longitude"][1] + +utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +utm_proj = pyproj.Proj(f"epsg:{utm_str}") +x_min, y_min = utm_proj(lon_min, lat_min) +x_max, y_max = utm_proj(lon_max, lat_max) + +num_lon_steps = int((x_max - x_min) / lon_step) +num_lat_steps = int((y_max - y_min) / lat_step) + +lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) +lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) +lon2 = lon1 + lon_step +lat2 = lat1 + lat_step + +# Convert UTM coordinates back to degrees +lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) +lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) + +# Convert UTM coordinates back to degrees with adjusted resolution +lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) +lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) + +polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lon2_deg, lat2_deg)] +grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") + +world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ne_110m_land/ne_110m_land.shp") +bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) +shapefile = world +clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) +clipped_shapefile.to_crs(utm_proj.srs) +# clipped_geometry = bbox.intersection(world.union_all()) +# clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) + +from shapely.geometry import MultiPolygon +# Create an empty list to store clipped geometries +# clipped_geometries = [] + +# # Iterate over each grid polygon +# for index, row in grid_gdf.iterrows(): +# # Intersect grid polygon with land shape +# intersection = row['geometry'].intersection(clipped_shapefile.unary_union) + +# # If intersection is a MultiPolygon, get the difference with the land shape +# if isinstance(intersection, MultiPolygon): +# clipped = row['geometry'].difference(clipped_shapefile.unary_union) +# if clipped.is_empty: +# continue +# clipped_geometries.append(clipped) +# else: +# # If intersection is a single Polygon, directly add to clipped geometries +# clipped_geometries.append(intersection) + +# clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) + +clipped_geometries = grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile.geometry.union_all()) +clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) +clipped_gdf.to_crs(epsg=32610) + +invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] +clipped_gdf = clipped_gdf.buffer(0.001) 
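+# NOTE: 46300 m equals 25 nmi (25 * 1852), so the next step appears to normalize cell areas from
+# m^2 to units of the nominal 25 x 25 nmi grid cell rather than true square meters, despite the
+# 'area_sqm' label.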
+clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 + +clipped_gdf.area + +fig, ax = plt.subplots(figsize=(10, 8)) +clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) +plt.tight_layout() +plt.show() + + +bbox.crs = {"init": "epsg:4326"} +intersection = gpd.overlay(bbox, world, how='intersection') + +world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') + +world_cut = world[world.geometry.intersects(bbox)] +world_cut.to_crs("epsg:4326") + +import matplotlib.pyplot as plt +fig, ax = plt.subplots(figsize=(10, 10)) +grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +world_cut.plot(ax=ax, linewidth=2, color='blue') +plt.show() + +for cell in grid_gdf: + + x, y = cell.exterior.xy # Extract x and y coordinates of the cell + ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch +# Plot coastline +# world.plot(ax=ax, linewidth=2, color='blue') +plt.show() + + +bbox = (lat_min, lon_min, lat_max, lon_max) +G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) +G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], tags={'natural': ['coastline']}) -# from echopop.acoustics import ts_length_regression, to_dB, to_linear -# from echopop.live.core import DATA_STRUCTURE -### INIT CONFIG -initialization_config = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) +longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) # Initialize `meta` attribute meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) From 3adf521bac3371b301513f9df04362f81b3fcfe9 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 19 Jul 2024 17:41:32 -0700 Subject: [PATCH 04/81] Updated methods/processing (plus SQL) --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 14 +- echopop/mesh_generation.py | 905 ++++++++++++++++++ echopop/zarr_read_ingest_test.py | 745 ++++++++++++-- 4 files changed, 1607 insertions(+), 61 deletions(-) create mode 100644 echopop/mesh_generation.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 6033595a..84c48bbb 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -31,8 +31,8 @@ griddify: # Coordinate bounds bounds: - latitude: [32.75, 54.75] - longitude: [-134.75, -117.00] + latitude: [32.75, 55.50] + longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: x: 25.0 diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index bf65930d..a8450039 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -8,11 +8,13 @@ # Parameters survey_year: 2019 # survey year being considered - +species: + text_code: pacific_hake # target species for the survey year -- species name + number_code: 22500 # target species for the survey year -- numeric code ############################################################################## # Directory path that contains all input data needed -data_root_dir: C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files +data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files 
############################################################################## # Input data directories @@ -23,6 +25,14 @@ input_directories: biological: directory: biology/ extension: csv + file_name_formats: + catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" + length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" + specimen: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:spec}" + file_index: + catch: [haul_num] + length: [haul_num, species_id] + specimen: [haul_num, species_id] file_ids: catch: catch_perc length: lf diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py new file mode 100644 index 00000000..3fab6d89 --- /dev/null +++ b/echopop/mesh_generation.py @@ -0,0 +1,905 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import geopandas as gpd +from geopy.distance import distance +from shapely.geometry import Polygon, Point, box +import geopandas as gpd +from shapely.ops import unary_union +import pyproj +import geopy +from echopop.spatial.projection import wgs84_to_utm, utm_string_generator +import shapely.geometry +from echopop.survey import Survey +survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml" , + survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml" ) + + +grid_settings = file_configuration["geospatial"]["griddify"] +# lat_min = grid_settings["bounds"]["latitude"][0] +lat_min = 33.75 +# lat_max = grid_settings["bounds"]["latitude"][1] +lat_max = 55.50 +# lon_min = grid_settings["bounds"]["longitude"][0] +lon_min = -134.25 +lon_max = grid_settings["bounds"]["longitude"][1] + +projection = file_configuration["geospatial"]["projection"] + +utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +utm_num = int(utm_code) +utm_str = f"epsg:{utm_num}" + +biology_data = filtered_biology_output + +from sqlalchemy import create_engine, text, Engine, inspect +root_dir = file_configuration["data_root_dir"] +db_directory = Path(root_dir) / "database" +db_directory.mkdir(parents=True, exist_ok=True) +db_file = db_directory / "biology.db" +# Create the engine with the full path +engine = create_engine(f'sqlite:///{db_file}') + +SQL_COMMANDS = { + "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", + "drop": "DROP TABLE IF EXISTS {table_name};", + "select": "SELECT {columns} FROM {table_name};", + "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", + # "insert": "INSERT INTO {table_name} ({columns});", + "insert": """ + INSERT INTO {table_name} ({columns}) + SELECT {columns} + FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) + {filter_clause}; + """, + "inspect": None, +} + +SQL_DTYPES = { + 'int32': 'INTEGER', + 'int64': 'INTEGER', + 'float64': 'FLOAT', + 'bool': 'BOOLEAN', + 'datetime64[ns]': 'DATETIME', + 'object': 'TEXT' +} + +def SQL(db_file: str, command: str, **kwargs): + + # Create engine from `db_file` string + engine = create_engine(f"sqlite:///{db_file}") + + # Format `columns`, if there are any and more than 1 + if "columns" in kwargs.keys(): + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Format `columns`, if there are any and more than 1 + # if "filter_columns" in kwargs.keys(): + # # ---- Store the value for later + # 
kwargs["filter_columns_store"] = kwargs["filter_columns"] + # if isinstance(kwargs["filter_columns"], list): + # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) + + # Run the command + try: + with engine.connect() as connection: + # ---- SELECT + if command == "select": + return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # ---- CREATE + elif command == "create": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Check whether the table already exists or not + table_exists = ( + connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() + ) + # ---- If it doesn't, pre-allocate the table + if table_exists is None: + # ---- Get column definitions as a string + column_def_dict = { + col: SQL_DTYPES.get(str(dtype), 'TEXT') + for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) + } + # ---- Convert to a single string + kwargs["column_definitions"] = ( + ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) + ) + # ---- Create table + connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) + # ---- REPLACE + elif command == "replace": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Replace current + df_to_add.to_sql(name=kwargs["table_name"], + con=connection, + if_exists="replace", index=False) + + # ---- INSERT + elif command == "insert": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Check if + # table_exists = ( + # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() + # ) + # tables = SQL(db_file, "inspect") + # ---- If it doesn't, pre-allocate the table + # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): + df_to_add.to_sql(name=kwargs["table_name"], + con=connection, + if_exists="append", index=False) + # else: + # # ---- Format `filter_columns` command if present + # if "filter_columns" in kwargs.keys(): + # # ---- Fetch table + # fetch_table = ( + # connection.execute(text( + # ("SELECT DISTINCT {filter_columns} FROM {table_name}") + # .format(**kwargs)) + # ) + # ) + # # ---- Format the SQL data into a DataFrame + # fetched_df = pd.DataFrame(fetch_table.fetchall(), columns=fetch_table.keys()) + # # ---- Create an index tuples + # index_tuples = ( + # set(fetched_df[kwargs["filter_columns_store"]] + # .itertuples(index=False, name=None)) + # ) + # # ---- Filter the dataframe + # filtered_df = ( + # df_to_add[ + # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) + # .isin(index_tuples) + # ] + # ) + # # ---- Insert the data + # filtered_df.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="append", index=False) + # else: + # df_to_add.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="append", index=False) + # ---- INSPECT + elif command == "inspect": + return inspect(engine).get_table_names() + else: + connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + finally: + # ---- Dispose of the engine to release any resources being pooled/used + engine.dispose() + +_ = SQL(db_file, "drop", table_name="catch_df") +_ = SQL(db_file, "drop", table_name="specimen_df") +_ = SQL(db_file, "drop", table_name="length_df") +_ = SQL(db_file, "drop", table_name="files_read") + +_ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) +current = SQL(db_file, "select", table_name="files_read", columns="filepath") +current + + +# Get acoustic directory and initialization settings +# ---- Files +biology_file_settings = 
file_configuration["input_directories"]["biological"] +# ---- General settings +biology_analysis_settings = file_configuration["biology"] + +# Get the file-specific settings, datatypes, columns, etc. +# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# ---- Extract the expected file name ID's +biology_file_ids = biology_file_settings["file_name_formats"] +# ---- Extract all of the file ids +biology_config_ids = list(biology_file_ids.keys()) +# ---- Initialize the dictionary that will define this key in the `input` attribute +biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# ---- Initialize the SQL dictionary +sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + +# Create full filepath +biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] +) +# ---- Directory check +directory_existence = biology_directory_path.exists() +# ---- Error evaluation (if applicable) +if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) +# ---- Get the defined file extension +file_extension = biology_file_settings["extension"] +# ---- Create Path.glob generator object +file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") +#---- Create list of `*.csv`` files +csv_files = list(file_path_obj) +# ---- Ensure files exist or raise error otherwise +if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" + ) +else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + +# Iterate through each of the file ids and read in the data +for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object 
with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + +# Get contrasts used for filtering the dataset +# ---- Species +species_filter = file_configuration["species"]["number_code"] +# ---- Trawl partition information +trawl_filter = biology_analysis_settings["catch"]["partition"] +# ---- Apply the filter +filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty +} + +# Update the SQL database +for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + +# Combine the two datasets +merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) +} +# ---- Return output +merged_output + +coordinate_metadata.attrs[] + +SQL(biology_db, command="drop", table_name="catch_df") +SQL(biology_db, command="drop", table_name="specimen_df") +SQL(biology_db, command="drop", table_name="length_df") +SQL(biology_db, command="drop", table_name="files_read") +_ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") +tables = SQL(db_file, "inspect") +tables +current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) +current + +SQL(db_file, "select", table_name="catch_df", columns="*") +new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) +_ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) +current = SQL("select", engine, table_name="csv_files_read", columns="file_path") +current +for table_name, df in biology_data.items(): + df.to_sql(table_name, con=engine, if_exists='append', index=False) +command = "read" +engine = create_engine(f'sqlite:///{db_file}') +table_name = "files_read" +columns = "file_path" + +kwargs = { + "table_name": table_name, + "columns": columns, +} + +zarr_data_ds["depth"].diff(dim="depth") + +prc_nasc_df.groupby(["longitude", "latitude"]) + +from pandas.core.groupby import DataFrameGroupBy + +def 
estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) + else: + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) + +def process_group(group): + result = integrate_nasc(group, echometrics=True) + result = result.reset_index(drop=True) + # Concatenate the result back to the original group for alignment + group = group.reset_index(drop=True) + combined = pd.concat([group, result], axis=1) + return combined + +acoustic_data_df = acoustic_data["prc_nasc_df"] + + +rc_nasc_df[prc_nasc_df["distance"] == 0.0] +acoustic_data_df = mek[mek["distance"] == 0.0] +pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() +nasc_data_df = ( + prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) + .reset_index() +) + + + + +kwargs = { + "table_name": "csv_files_read", + "columns": "file_path", + "dataframe": new_files_df +} + +current_process = psutil.Process() +import logging + +# Create a session +Session = sessionmaker(bind=engine) +session = 
Session() + +# Perform database operations +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +logger.info("Performing database operations") + +# Create a session +Session = sessionmaker(bind=engine) +session = Session() + +# Perform database operations +logger.info("Performing database operations") + +# Close the session +session.close() +logger.info("Session closed") + +# Dispose the engine +engine.dispose() +logger.info("Engine disposed") + +# Force garbage collection +import gc +gc.collect() +logger.info("Garbage collection performed") + +import psutil + +pid = psutil.Process().pid +process = psutil.Process(pid) +open_files = process.open_files() +db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' + +# Check if the file is still in use +for file in open_files: + if db_path in file.path: + logger.info(f"File {db_path} is still in use.") + else: + logger.info(f"File {db_path} is not in use.") + +# Define the SQL to drop the table +drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" +# Execute the drop table SQL +with engine.connect() as connection: + _ = connection.execute(text(drop_table_sql)) + +import sqlite3 +if os.path.exists(db_path): + conn = sqlite3.connect(db_path) + conn.close() + # Force the file to be removed + try: + os.remove(db_path) + print(f"Database file {db_path} has been deleted.") + except PermissionError: + print(f"Failed to delete {db_path}. The file is still in use.") + +create_table_sql = """ +CREATE TABLE IF NOT EXISTS csv_files_read ( + file_path TEXT UNIQUE +); +""" +# Execute the create table SQL +with engine.connect() as connection: + _ = connection.execute(text(create_table_sql)) + +root_directory = Path(root_dir) +dataset = "biology" + +# Convert to strings +csv_files_str = [str(file) for file in csv_files] + +existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) +existing_files_set = set(existing_files_df['file_path']) +# Filter out duplicates from the csv_files list +new_files = [file for file in csv_files_str if file not in existing_files_set] +# Insert only new file paths into the SQL table +if new_files: + new_files_df = pd.DataFrame(new_files, columns=['file_path']) + _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) + + +with engine.connect() as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS csv_files_read ( + file_path TEXT UNIQUE + ) + """) + +csv_files +files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) +file_name_format = biology_file_ids[id] +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +from sqlalchemy.orm import sessionmaker +Session = sessionmaker(bind=engine) +session = Session() +session.close() +engine.pool.status() +# Dispose the engine to close all connections +engine.dispose() +import gc +gc.collect() +import psutil +dbapi_conn = engine.raw_connection() +dbapi_conn.close() +# Get the process ID of the current process +pid = psutil.Process().pid + +# List all open files for the 
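# NOTE: Illustrative sketch of the connection scoping that the file-handle checks around this
# block are probing for; `check_engine` is a throwaway name used only for this example.
check_engine = create_engine(f"sqlite:///{db_path}")
try:
    with check_engine.connect() as connection:
        connection.execute(text("SELECT 1"))
finally:
    # Dispose of the engine so SQLite releases its lock on the `*.db` file (important on Windows)
    check_engine.dispose()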
current process +process = psutil.Process(pid) +open_files = process.open_files() + +for file in open_files: + print(file.path) + + +pattern = filename_format +config_settings = sub_config_map +regex_pattern = pattern + +# Replace patterns based on LIVE_FILE_FORMAT_MAP +for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) +regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) +new_pattern = compile_filename_format(regex_pattern) +match_obj = new_pattern.search(file.name) +# Get substring components as a list +filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) +valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + +for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + + +# Assign the data as new columns to the DataFrame +for key, value in data_to_add.items(): + df[key] = value + +for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) +biology_analysis_settings +species_id_value = 22500 +trawl_partition_value = 'Codend' # Adjust as needed +{ + key: df[ + (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & + (('trawl_partition' not in df.columns) or (df['trawl_partition'] == trawl_partition_value)) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) +} + +(match_obj.group(i)).astype(matched_key["dtype"]) +pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' +modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) +# Create the regex pattern +regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +re.compile(regex_pattern) + +modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) + +# Create the regex pattern +regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +compile_filename_format(regex_pattern) +# Regular expression to capture values inside the curly braces +regex = r'\{([^:}]+):([^}]+)\}' + +# Find all matches +matches = re.findall(regex, modified_pattern) + +# Get substring components as a list +filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + +pattern_changed = pattern.replace("FILE_ID:", "") + +# Compilte the filename regular expression format +compiled_regex = compile_filename_format(pattern_changed) + +file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] + + # Get the file name and produce a `re.Match` object +match_obj = compiled_regex.search(file.name) + + +def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + + # Get the file name and produce a `re.Match` object + match_obj = pattern.search(file.name) + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_settings["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + + # Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the haul number and add the the dataframe + # ---- Extract the haul number and convert to an integer + haul_num = int(match_obj.group("HAUL")) + # ---- Add the column + df_validated["haul_num"] = haul_num + + # Return the resulting DataFrame + return df_validated + +## +grid_settings["grid_resolution"]["x"] = 50 +grid_settings["grid_resolution"]["y"] = 50 +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters + +# CREATE BOUNDING +bound_df = pd.DataFrame({ + "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), + "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) +}) + +bound_gdf = gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), + crs = projection +) + +utm_string_generator(-117.0, 33.75) +bound_gdf.total_bounds +# Convert to UTM +bound_utm = bound_gdf.to_crs(utm_num) +bound_utm.total_bounds +y_step = lat_step +x_step = lon_step +# bound_utm = bound_gdf +# y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 +# x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 + +xmin, ymin, xmax, ymax = bound_utm.total_bounds + +# Get number of cells +n_x_cells = int(np.ceil((xmax - xmin) / x_step)) +n_y_cells = int(np.ceil((ymax - ymin) / y_step)) + +import pyproj +# create the cells in a loop +# grid_cells = [] +# for x0 in np.arange(xmin, xmax, x_step): +# for y0 in np.arange(ymin, ymax, y_step): +# # bounds +# utm_zone = utm_string_generator(x0, y0) +# proj = pyproj.Proj(f"epsg:{utm_code}") +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +grid_cells = [] +for y0 in np.arange(ymin, ymax, y_step): + + # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) + + for x0 in np.arange(xmin, xmax, x_step): + # bounds + # utm_zone = utm_string_generator(x0, y0) + # proj = pyproj.Proj(f"epsg:{utm_code}") + # x1, y1 = proj(x0, y0) + # x2, y2 = proj(x0 - x_step, y0 + y_step) + # grid_cells.append(box(x1, y1, x2, y2)) + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +cells_gdf.shape +n_x_cells * n_y_cells +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) +cells_gdf.total_bounds +cells_gdf.to_crs(projection).total_bounds +from shapely.validation import make_valid +from shapely.geometry import mapping +######## +world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ne_10m_land/ne_10m_land.shp") +bb_orig = box(lon_min, lat_min, lon_max, lat_max) +boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) +world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) +world_clipped_latlon = gpd.clip(world, boundary_box) +world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) + +world_utm = world.to_crs(utm_code) +world_utm = world_utm[~world_utm.is_empty] + +bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) + +gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) + +bbox_utm = bound_utm.total_bounds + +buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] +array_buffer = 
bbox_utm + buffer +array_names = ["minx", "miny", "maxx", "maxy"] +buffered = dict(zip(array_names, array_buffer)) +buffer_boundary = box(**buffered) +# box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) +# buffer_boundary = buffer_boundary.to_crs(world_utm.crs) + +buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) # Replace with the correct EPSG code +bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) +# sub_clipped = gpd.clip(world_utm, buffer_boundary) +# sub_clipped = gpd.clip(world_utm, bbox_utm) + +# fig, ax = plt.subplots(figsize=(10, 10)) +# # Plot the buffer_boundary +# world.plot(ax=ax, linewidth=2, color='gray') +# buffer_boundary_gdf.to_crs(projection).plot(ax=ax, facecolor='none', edgecolor='blue') +# bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') +# plt.xlim(lon_min-3, lon_max+3) +# plt.ylim(lat_min-3, lat_max+3) +# plt.show() + +len(bbox_latlon.exterior.coords) +len(buffer_boundary.exterior.coords) + +# world_clipped_latlon = gpd.clip(world_utm, buffer_boundary).to_crs(projection) +world_clipped_latlon +######## +cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()).to_frame("geometry") +# cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()).to_frame("geometry") +cell_colors = cells_clipped.area / (lat_step * lon_step) +# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +cells_clipped['cell_colors'] = cell_colors +# ---> back to epsg lat/long +cells_latlon = cells_clipped.to_crs(projection) +cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) +cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) +# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +cells_latlon_clipped['cell_colors'] = cell_colors_clipped +######## +from shapely.geometry import Point, LineString, shape +nasc_df = survey.input["acoustics"]["nasc_df"] +nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], nasc_df["latitude"]), crs=projection) +geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())).to_frame("geometry").set_crs(projection) +custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' +cells_latlon_clipped.to_crs(custom_crs).crs +######## +import matplotlib.colors as colors +import matplotlib.cm as cm +cells_transformed = cells_latlon.to_crs(utm_code) +lims = cells_transformed.total_bounds + +fig, ax = plt.subplots(figsize=(10, 10)) +# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# cells_clipped.plot.hexbin() +cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), vmax=cells_latlon["cell_colors"].max()) +cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, orientation="horizontal", shrink=0.5) +cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, 
labelpad=10, loc='center') +cbar.ax.xaxis.set_label_position('top') +cbar.ax.xaxis.set_ticks_position('top') +geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") +# geo_df.reset_index().plot(ax=ax, color="red") +# plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") +ax.margins(0.00, 0.00) +world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') +plt.xlim(lims[0]*1.02, lims[2]*1.01) +# ax.set_yticks([4e6, 5e6, 6e6]) +# ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +plt.ylim(lims[1]*0.98, lims[3]*1.005) +ax.set_yticks([4e6, 5e6, 6e6]) +ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +plt.xlabel("Eastings (km)") +plt.ylabel("Northings (km)") +# plt.xlabel("Longitude (°E)") +# ax.set_xticks([-135, -130, -125, -120]) +# plt.ylabel("Latitude (°N)") +ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) +ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) +# Adding the colorbar title +# cax = fig.get_axes()[1] # Assuming the colorbar is the second axis +# cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar +plt.tight_layout() +plt.show() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 10ecf076..44a83ab4 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -9,6 +9,10 @@ import glob from datetime import datetime import geopandas as gpd +import os +import re +import contextlib +from sqlalchemy import create_engine, text, Engine, inspect #################################################################################################### # * Functionality for a) loading YAML configuration file, b) search defined directory for @@ -55,11 +59,11 @@ def live_configuration(live_init_config_path: Union[str, Path], #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` file_configuration = live_configuration(live_init_config_path, live_file_config_path) -file_configuration +file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling # TODO: Documentation @@ -98,20 +102,156 @@ def configure_transmit_frequency(frequency_values: pd.Series, "longitude": float, "ping_time": "datetime64[ns]", } - } + }, + "biology": { + "catch": { + "dtypes": { + "partition": str, + "species_code": int, + "sample_weight_kg": float, + "catch_perc": float, + }, + "names": { + "partition": "trawl_partition", + "species_code": "species_id", + "sample_weight_kg": "haul_weight", + "catch_perc": "catch_percentage", + } + }, + "length": 
{ + "dtypes": { + "sex": str, + "rounded_length": int, + "frequency": int, + }, + "names": { + "sex": "sex", + "rounded_length": "length", + "frequency": "length_count", + } + }, + "specimen": { + "dtypes": { + "rounded_length": int, + "organism_weight": float, + "sex": str, + }, + "names": { + "sex": "sex", + "rounded_length": "length", + "organism_weight": "weight" + }, + }, + }, } + +LIVE_FILE_FORMAT_MAP = { + "DATE:YYYYMM": { + "name": "date", + "dtype": "datetime[ns]", + "expression": r"(?P\d{6})", + }, + "DATE:YYYYMMDD": { + "name": "date", + "dtype": "datetime[ns]", + "expression": r"(?P\d{8})", + }, + "HAUL": { + "name": "haul_num", + "dtype": int, + "expression": r"(?P\d+)", + }, + "SPECIES_CODE": { + "name": "species_id", + "dtype": int, + "expression": r"(?P\d+)" + }, + "FILE_ID": { + "name": "file_id", + "dtype": str, + "expression": r"(?P.+)" + }, +} + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_settings["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" + ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated #################################################################################################### # * Functionality for reading in processed acoustic data # TODO: Expand data validator and limit cases to '*.zarr' (for now) # TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
# TODO: Documentation -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Dataset]: +def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: # Get acoustic directory and initialization settings # ---- Files acoustic_file_settings = file_configuration["input_directories"]["acoustic"] # ---- General settings acoustic_analysis_settings = file_configuration["acoustics"] + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ---- Initialize the dictionary that will define this key in the `input` attribute + acoustics_output = {"prc_nasc_df": pd.DataFrame(), + "nasc_df": pd.DataFrame()} + # ---- Initialize the SQL dictionary + # sql_acoustics_output = {"sv_df": pd.DataFrame()} + # Create full filepath acoustic_directory_path = ( Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] @@ -127,57 +267,79 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas ) # ---- Get the defined file extension file_extension = acoustic_file_settings["extension"] - # ---- In the case of a *.zarr file - if file_extension == "zarr": - # ---- Create Path.glob generator object - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" - ) - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ! 
[REQUIRES DASK] ---- Read in all listed files - # TODO: The sliding/overlapping windows makes this annoying -- in theory, only a single new zarr file will be ingested - # TODO: So this needs to be replaced w/ `open_dataset` instead - zarr_data_ds = xr.open_mfdataset(zarr_files, - engine="zarr", - chunks="auto", - data_vars=specified_vars, - coords=specified_coords) - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + # ---- Create Path.glob generator object (the case of a *.zarr file) + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()] - # ---- Validate data types - zarr_data_df_filtered = ( - zarr_data_df_filtered - .apply(lambda col: col.astype(full_config_map[col.name]) - if col.name in full_config_map else col) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "acoustics.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + zarr_files_str = [str(file) for file in zarr_files] + # ---- Create DataFrame + current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `zarr_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in zarr_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Find new files that have not yet been processed + if not new_files: + subset_files = zarr_files + else: + subset_files = set(zarr_files).intersection(set(new_files)) + + # Read in the `*.zarr` file(s) + # ! 
[REQUIRES DASK] ---- Read in the listed file + if len(subset_files) > 1: + zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + elif len(subset_files) == 1: + zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" ) - + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + # Extract defined acoustic frequency # ---- From the configuration transmit_settings = acoustic_analysis_settings["transmit"] @@ -197,14 +359,475 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas # ---- Replace NASC `NaN` values with `0.0` zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) # ---- Drop frequency column and return the output - return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata + acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) + # ---- Return output + if update_config: + if file_configuration["database"]["acoustics"] is None: + file_configuration["database"]["acoustics"] = db_file + return acoustics_output, file_configuration + else: + return acoustics_output #################################################################################################### # TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION # NOTE: # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) +acoustic_data, file_configuration = load_acoustic_data(file_configuration) acoustic_data -coordinate_metadata +#################################################################################################### +def load_biology_data(file_configuration: dict, update_config: bool = True): + + # Get acoustic directory and initialization settings + # ---- Files + biology_file_settings = file_configuration["input_directories"]["biological"] + # ---- General settings + biology_analysis_settings = file_configuration["biology"] + + # Get the file-specific settings, datatypes, columns, etc. 
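    # NOTE: Minimal sketch of how the config map drives the CSV validation performed later by
    # `read_biology_csv()`; the example row values below are fabricated for illustration only.
    _example_raw = pd.DataFrame({"partition": ["Codend"], "species_code": [22500],
                                 "sample_weight_kg": [12.5], "catch_perc": [98.2]})
    _example_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"]["catch"]
    _example_validated = (
        _example_raw.astype(_example_map["dtypes"]).rename(columns=_example_map["names"])
    )
    # ---- Columns become: trawl_partition, species_id, haul_weight, catch_percentage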
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Initialize the SQL dictionary + sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + # Create full filepath + biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + ) + # ---- Directory check + directory_existence = biology_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = biology_file_settings["extension"] + # ---- Create Path.glob generator object + file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") + #---- Create list of `*.csv`` files + csv_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" + ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + 
subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return merged_output +#################################################################################################### +# TEST: BIOLOGY FILE INGESTION CONFIGURATION +# NOTE: +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +biology_data, file_configuration = load_biology_data(file_configuration) +biology_data +#################################################################################################### +prc_nasc_df = acoustic_data["prc_nasc_df"] + +def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + 
"aggregation": float, "occupied_area": float}) + ) + + # Get the name of the associated db file + acoustics_db = file_configuration["database"]["acoustics"] + # ---- Get current tables + tables = SQL(acoustics_db, "inspect") + + # + if "nasc_df" not in tables: + _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) + else: + # ---- + nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") + # ---- + index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) + # ---- + bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) + # ---- + _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df.loc[~bool_idx]) + # ---- + nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) + + # Return the output + return nasc_data_df + + +SQL(acoustics_db, command="drop", table_name="nasc_df") +SQL(acoustics_db, "inspect") + +nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) + +SQL(acoustics_db, command="select", table_name="nasc_df") + +TS_SLOPE = 20.0 +TS_INTERCEPT = -68.0 + +# CONVERT TO TS +comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# TO SIGMA_BS +comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# WEIGHTED MEAN SIGMA_BS +sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +from typing import Optional +from echopop.utils import operations +from echopop.acoustics import ts_length_regression, to_linear, to_dB + +__all__ = ["operations"] + +# Meld bio datasets +length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], + contrasts=["haul_num", "sex", "species_id", "length"]) + +# Create distribution +distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + +length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float) +binwidth = np.diff(length_bins / 2.0).mean() +intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) +length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) +# +length_datasets["length_bin"] = pd.cut(length_datasets["length"], bins=intervals, labels=length_bins_df["bin"]) + +stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] + +if stratify_key == "global": + length_distribution = ( + length_datasets.pivot_table(columns=["sex"], index=["length_bin"], + values="length_count", aggfunc="sum", observed=False) + ) + # + length_distribution["total"] = length_distribution.sum(axis=1) + +length_distribution.transpose() +SQL(biology_db, "drop", table_name="length_distribution") +# Get the name of the associated db file +biology_db = file_configuration["database"]["biology"] +# ---- Get current tables +tables = SQL(biology_db, "inspect") + + +if "length_distribution" not in tables: + _ = SQL(biology_db, "insert", table_name="length_distribution", + dataframe=length_distribution.transpose()) + + +SQL(biology_db, "select", table_name="length_distribution") +SQL(biology_db, "drop", table_name="length_distribution") +SQL(biology_db, "replace", table_name="length_distribution", dataframe=length_distribution.unstack().reset_index(name="count")) +length_distribution.unstack().reset_index(name="count") +mixed = SQL(biology_db, "select", table_name="length_distribution") +length_bins[:1] +from typing import Optional +from echopop.utils import operations +from echopop.acoustics import ts_length_regression, to_linear, 
to_dB + +__all__ = ["operations"] + +# Meld bio datasets +length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], + contrasts=["haul_num", "species_id", "length"]) + +ts_length_parameters_spp = [ + spp + for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() + if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) +] + +# ---- get species info +target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + +ts_lengths_df = length_datasets.merge( + target_species.drop("length_units", axis=1), + left_on=["species_id"], + right_on=["number_code"], +) +# ---- filter out other spp +length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + +# +file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] + +def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Optional[float] = None, TS_L_intercept: Optional[float] = None, weighted: Optional[Union[float, int, str]] = None): + + # + if isinstance(length, pd.DataFrame): + if "length" not in length.columns: + raise ValueError( + "Column [`length`] missing from dataframe input `length`." + ) + if "TS_L_slope" not in length.columns and TS_L_slope is None: + raise ValueError( + "Value [`TS_L_slope`] missing from dataframe input `length` and optional " + "separate argument `TS_L_slope`." + ) + if "TS_L_intercept" not in length.columns and TS_L_intercept is None: + raise ValueError( + "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " + "separate argument `TS_L_intercept`." + ) + elif isinstance(length, float) or isinstance(length, int): + if TS_L_slope is None: + raise ValueError( + "Argument [`TS_L_slope`] missing." + ) + elif TS_L_slope is not None and not isinstance(TS_L_slope, float): + raise TypeError( + "Argument `TS_L_slope` must be type `float`." + ) + if "TS_L_intercept" not in length.columns and TS_L_intercept is None: + raise ValueError( + "Argument [`TS_L_intercept`] missing." + ) + elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): + raise TypeError( + "Argument `TS_L_intercept` must be type `float`." + ) + + # + if TS_L_slope is None: + TS_L_slope = length["TS_L_slope"] + + # + if TS_L_intercept is None: + TS_L_intercept = length["TS_L_intercept"] + + # + if isinstance(length, pd.DataFrame): + length_val = length["length"] + + ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) + sigma_bs_value = to_linear(ts_value) + + + + if isinstance(weighted, str): + if weighted not in length.columns: + raise ValueError( + f"Argument [`weighted` (str)], '{weighted}', is not a column in argument `length` " + f"(DataFrame)." + ) + else: + return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() + elif weighted is not None: + if weighted.size != sigma_bs_value.size: + raise ValueError( + f"Argument [`weighted` (float|int)] of size {weighted.size} does not match size of " + f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." 
+ ) + else: + return (sigma_bs_value * weighted).sum() / weighted.sum() + else: + return sigma_bs_value.mean() + +average_sigma_bs + +ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) +def integrate_nasc(prc_nasc_df: pd.DataFrame): + +# Compute the number of layers +echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size +}) + +# Compute the index of aggregation +echometrics.update({ + "aggregation": 1 / echometrics["evenness"] +}) + +# Get the occupied area +echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) +}) + + + + +pd.read_fr +pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) +engine = create_engine(f"sqlite:///{db_file}") +connection = engine.connect() +kwargs["dataframe"].to_sql(name=kwargs["table_name"], + con=connection, + if_exists="append", index=False) +connection.close() +engine.dispose() +SQL(db_file, "insert", table_name=table_name, columns="*", + filter_columns=insertion_filter, + dataframe=df) + +SQL(db_file, "select", table_name="files_read") +SQL(db_file, "select", table_name="catch_df") +SQL(db_file, "select", table_name="specimen_df") +SQL(db_file, "select", table_name="length_df") + +def check_table_schema(connection, **kwargs): + query = text(("PRAGMA table_info({table_name});").format(**kwargs)) + schema = connection.execute(query).fetchall() + print("Table Schema:", schema) + +check_table_schema(connection, table_name=table_name) + +def insert_test_data(connection, table_name): + test_data = pd.DataFrame({ + 'trawl_partition': ['test'], + 'species_id': ['test'], + 'haul_weight': [0.0], + 'catch_percentage': [0.0], + 'haul_num': [1] + }) + + test_data.to_sql(name=table_name, con=connection, if_exists='append', index=False) + print("Test data inserted.") + +insert_test_data(connection, table_name) + +kwargs = {} +command = "insert" +kwargs["table_name"] = "catch_df" +kwargs["dataframe"] = df +kwargs["filter_columns"] = insertion_filter +columns = "*" + + +re.compile(file_name_format) +pattern = file_name_format +pattern = pattern.replace('{DATE:YYYYMM}', r'(?P\d{6})') +pattern = pattern.replace('{HAUL}', r'(?P\d+)') +pattern = pattern.replace('{FILE_ID}', r'(?P.+)') +regex = re.compile(pattern) +haul_values = [] + +file_name_format.search(file.name) +sub_df_lst = [] +for file in subcsv_files: + match = regex.search(file.name) + if match: + haul_value = match.group('HAUL') + df = pd.read_csv(file, usecols=list(sub_config_map.keys())) + df['HAUL'] = haul_value # Append HAUL value as a new column + sub_df_lst.append(df) #################################################################################################### def load_spatial_data(file_configuration: dict, acoustic_data: pd.DataFrame, @@ -438,6 +1061,14 @@ def __init__( x_min, y_min = utm_proj(lon_min, lat_min) x_max, y_max = utm_proj(lon_max, lat_max) +lat = 55.5000 +lon = -134.2500 +utm_code = int(utm_string_generator(lon, lat)) +utm_proj = pyproj.Proj(f"epsg:{utm_code}") +utm_proj(lon, lat) +gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=projection).to_crs(utm_code) + + num_lon_steps = int((x_max - x_min) / lon_step) num_lat_steps = int((y_max - y_min) / lat_step) From 9b79d814f7c2ea949d0a7ff3ce9d40e7d724d6d3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Sun, 21 Jul 2024 20:05:09 -0700 Subject: [PATCH 05/81] Updating `LiveSurvey` methods --- echopop/live/__init__.py | 4 + echopop/live/acoustics.py 
| 0 echopop/live/core.py | 28 -- echopop/live/live_data_processing.py | 422 +++++++++++++++++++++++++++ echopop/live/liveacoustics.py | 143 +++++++++ echopop/live/livecore.py | 119 ++++++++ echopop/live/livesurvey.py | 56 ++-- echopop/live/sql_methods.py | 73 +++++ 8 files changed, 799 insertions(+), 46 deletions(-) delete mode 100644 echopop/live/acoustics.py delete mode 100644 echopop/live/core.py create mode 100644 echopop/live/live_data_processing.py create mode 100644 echopop/live/liveacoustics.py create mode 100644 echopop/live/livecore.py create mode 100644 echopop/live/sql_methods.py diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index b8585ba9..f4e742bb 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -1 +1,5 @@ +from echopop.utils import operations + +__all__ = ["operations"] + from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/acoustics.py b/echopop/live/acoustics.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/core.py b/echopop/live/core.py deleted file mode 100644 index de066ae3..00000000 --- a/echopop/live/core.py +++ /dev/null @@ -1,28 +0,0 @@ -from datetime import datetime - -import pandas as pd - -LIVE_DATA_STRUCTURE = { - "meta": { - "provenance": dict(), - "date": list(), - }, - "input": { - "acoustics": { - "nasc_df": pd.DataFrame(), - }, - "biology": { - "catch_df": pd.DataFrame(), - "distributions": { - "length_bins_df": pd.DataFrame(), - }, - "length_df": pd.DataFrame(), - "specimen_df": pd.DataFrame(), - }, - }, - "results": { - "acoustics": dict(), - "biology": dict(), - "stratified": dict(), - }, -} \ No newline at end of file diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py new file mode 100644 index 00000000..293862c4 --- /dev/null +++ b/echopop/live/live_data_processing.py @@ -0,0 +1,422 @@ +import yaml +import re + +from pathlib import Path +from typing import Union, Tuple + +import pandas as pd +import xarray as xr +import numpy as np + +from .livecore import( + LIVE_DATA_STRUCTURE, + LIVE_FILE_FORMAT_MAP, + LIVE_INPUT_FILE_CONFIG_MAP +) + +from .sql_methods import SQL + +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = 
set(init_config.keys()).intersection(set(file_config.keys()))
+    # ---- Raise error if needed
+    if config_intersect:
+        raise ValueError(
+            f"The initialization and file configuration files comprise the following intersecting "
+            f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration "
+            f"file."
+        )
+
+    # Combine both into a dictionary output that can be added to the `LiveSurvey` class object
+    return {**init_config, **file_config}
+
+# TODO: Documentation
+def compile_filename_format(file_name_format: str):
+
+    # Create a copy of `file_name_format`
+    regex_pattern = file_name_format
+
+    # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern
+    for key, value in LIVE_FILE_FORMAT_MAP.items():
+        regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
+    # ---- Replace the `FILE_ID` tag
+    regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P<FILE_ID>\1)', regex_pattern)
+
+    # Compile the regex pattern and return the output
+    return re.compile(regex_pattern)
+
+
+# TODO: Documentation
+def configure_transmit_frequency(frequency_values: pd.Series,
+                                 transmit_settings: dict,
+                                 current_units: str):
+
+    # Extract transmit frequency units defined in configuration file
+    configuration_units = transmit_settings["units"]
+
+    # Transform the units, if necessary
+    # ---- Hz to kHz
+    if current_units == "Hz" and configuration_units == "kHz":
+        return frequency_values * 1e-3
+    # ---- kHz to Hz
+    elif current_units == "kHz" and configuration_units == "Hz":
+        return frequency_values * 1e3
+    # ---- No change
+    else:
+        return frequency_values
+
+# TODO: Documentation
+def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict):
+
+    # Read in the `*.csv` file
+    df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys()))
+
+    # Validate the dataframe
+    # ---- Check for any missing columns
+    missing_columns = (
+        [key for key in config_settings["dtypes"].keys() if key not in df.columns]
+    )
+    # ---- Raise Error, if needed
+    if missing_columns:
+        raise ValueError(
+            f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!"
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated + +# TODO: Documentation +# TODO: Refactor, break up cyclomatic complexity +def load_biology_data(file_configuration: dict, update_config: bool = True): + + # Get acoustic directory and initialization settings + # ---- Files + biology_file_settings = file_configuration["input_directories"]["biological"] + # ---- General settings + biology_analysis_settings = file_configuration["biology"] + + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Initialize the SQL dictionary + sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + # Create full filepath + biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + ) + # ---- Directory check + directory_existence = biology_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = biology_file_settings["extension"] + # ---- Create Path.glob generator object + file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") + #---- Create list of `*.csv`` files + csv_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" 
+ ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == 
species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return merged_output + +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. +# TODO: Documentation +def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: + # Get acoustic directory and initialization settings + # ---- Files + acoustic_file_settings = file_configuration["input_directories"]["acoustic"] + # ---- General settings + acoustic_analysis_settings = file_configuration["acoustics"] + + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ---- Initialize the dictionary that will define this key in the `input` attribute + acoustics_output = {"prc_nasc_df": pd.DataFrame(), + "nasc_df": pd.DataFrame()} + # ---- Initialize the SQL dictionary + # sql_acoustics_output = {"sv_df": pd.DataFrame()} + + # Create full filepath + acoustic_directory_path = ( + Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + ) + + # Validate filepath, columns, datatypes + # ---- Directory check + directory_existence = acoustic_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{acoustic_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = acoustic_file_settings["extension"] + # ---- Create Path.glob generator object (the case of a *.zarr file) + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" 
+ ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "acoustics.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + zarr_files_str = [str(file) for file in zarr_files] + # ---- Create DataFrame + current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `zarr_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in zarr_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Find new files that have not yet been processed + if not new_files: + subset_files = zarr_files + else: + subset_files = set(zarr_files).intersection(set(new_files)) + + # Read in the `*.zarr` file(s) + # ! [REQUIRES DASK] ---- Read in the listed file + if len(subset_files) > 1: + zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + elif len(subset_files) == 1: + zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Extract defined acoustic frequency + # ---- From the configuration + transmit_settings = acoustic_analysis_settings["transmit"] + # ---- Transform `frequency_nominal`, if necessary + zarr_data_df_filtered["frequency_nominal"] = ( + configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], + transmit_settings, + zarr_data_ds["frequency_nominal"].units) + ) + # ---- Filter out any unused frequency coordinates + zarr_data_df_output = ( + zarr_data_df_filtered + [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) + # ---- Drop frequency column and return the output + acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) + # ---- Return output + if update_config: + if file_configuration["database"]["acoustics"] is None: + file_configuration["database"]["acoustics"] = db_file + return acoustics_output, file_configuration + else: + return acoustics_output \ No newline at end of file diff --git a/echopop/live/liveacoustics.py b/echopop/live/liveacoustics.py new file mode 100644 index 00000000..f526f578 --- /dev/null +++ b/echopop/live/liveacoustics.py @@ -0,0 +1,143 @@ +from typing import Union, Optional + +import pandas as pd + +from echopop.acoustics import ts_length_regression, to_linear, to_dB + +# TODO: Documentation +def average_sigma_bs(length: Union[pd.DataFrame, float, int], + weights: Optional[Union[float, int, str]] = None): + + # Function approach for dataframe input + if isinstance(length, pd.DataFrame): + if "length" not in length.columns: + raise ValueError( + "Column [`length`] missing from dataframe input `length`." + ) + elif "TS_L_slope" not in length.columns: + raise ValueError( + "Column [`TS_L_slope`] missing from dataframe input `length`." + ) + elif "TS_L_slope" not in length.columns: + raise ValueError( + "Column [`TS_L_intercept`] missing from dataframe input `length`." + ) + else: + # ---- Compute the TS (as an array) + target_strength = ts_length_regression(length["length"], length["TS_L_slope"], + length["TS_L_intercept"]) + # ---- Convert to `sigma_bs` + sigma_bs_value = to_linear(target_strength) + # ---- Weighted or arithmetic avveraging + if weights is None: + return sigma_bs_value.mean() + elif weights not in length.columns: + raise ValueError( + f"Defined `weights` column, {weights}, missing from dataframe input " + f"`length`." 
+ ) + else: + return (sigma_bs_value * length[weights]).sum() / length[weights].sum() + +# TODO: Documentation +# TODO: Refactor +def estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) + else: + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) diff --git a/echopop/live/livecore.py b/echopop/live/livecore.py new file mode 100644 index 00000000..83e72a86 --- /dev/null +++ b/echopop/live/livecore.py @@ -0,0 +1,119 @@ +from datetime import datetime + +import pandas as pd + +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "database": { + "acoustics": None, + "biology": None, + "population": None, + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} + +# TODO: Update structure with additional information (as needed) 
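`LIVE_DATA_STRUCTURE` above acts as a template rather than shared state: its sub-dictionaries (`meta`, `database`, `input`, `results`) are deep-copied into each `LiveSurvey` instance later in this patch, so separate instances never mutate the same containers. A minimal sketch of that usage pattern follows; it assumes `echopop.live.livecore` is importable as created by this patch, and the variable name `survey_input` is purely illustrative.

import copy

import pandas as pd

from echopop.live.livecore import LIVE_DATA_STRUCTURE

# Deep-copy the "input" template so the module-level dictionary stays pristine
survey_input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"])

# Populating the copy does not mutate the shared template
survey_input["acoustics"]["nasc_df"] = pd.DataFrame({"NASC": [0.0, 12.5]})
assert LIVE_DATA_STRUCTURE["input"]["acoustics"]["nasc_df"].empty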
+# TODO: Documentation
+LIVE_INPUT_FILE_CONFIG_MAP = {
+    "acoustics": {
+        "xarray_coordinates": {
+            "distance": float,
+            "depth": float,
+        },
+        "xarray_variables": {
+            "NASC": float,
+            "frequency_nominal": float,
+            "latitude": float,
+            "longitude": float,
+            "ping_time": "datetime64[ns]",
+        }
+    },
+    "biology": {
+        "catch": {
+            "dtypes": {
+                "partition": str,
+                "species_code": int,
+                "sample_weight_kg": float,
+                "catch_perc": float,
+            },
+            "names": {
+                "partition": "trawl_partition",
+                "species_code": "species_id",
+                "sample_weight_kg": "haul_weight",
+                "catch_perc": "catch_percentage",
+            }
+        },
+        "length": {
+            "dtypes": {
+                "sex": str,
+                "rounded_length": int,
+                "frequency": int,
+            },
+            "names": {
+                "sex": "sex",
+                "rounded_length": "length",
+                "frequency": "length_count",
+            }
+        },
+        "specimen": {
+            "dtypes": {
+                "rounded_length": int,
+                "organism_weight": float,
+                "sex": str,
+            },
+            "names": {
+                "sex": "sex",
+                "rounded_length": "length",
+                "organism_weight": "weight"
+            },
+        },
+    },
+}
+
+LIVE_FILE_FORMAT_MAP = {
+    "DATE:YYYYMM": {
+        "name": "date",
+        "dtype": "datetime[ns]",
+        "expression": r"(?P<DATE>\d{6})",
+    },
+    "DATE:YYYYMMDD": {
+        "name": "date",
+        "dtype": "datetime[ns]",
+        "expression": r"(?P<DATE>\d{8})",
+    },
+    "HAUL": {
+        "name": "haul_num",
+        "dtype": int,
+        "expression": r"(?P<HAUL>\d+)",
+    },
+    "SPECIES_CODE": {
+        "name": "species_id",
+        "dtype": int,
+        "expression": r"(?P<SPECIES_CODE>\d+)"
+    },
+    "FILE_ID": {
+        "name": "file_id",
+        "dtype": str,
+        "expression": r"(?P<FILE_ID>.+)"
+    },
+}
diff --git a/echopop/live/livesurvey.py b/echopop/live/livesurvey.py
index 70765b0f..6d6a8621 100644
--- a/echopop/live/livesurvey.py
+++ b/echopop/live/livesurvey.py
@@ -3,8 +3,10 @@
 import copy
 import yaml
 
-from .core import(
-    DATA_STRUCTURE
+from .livecore import(
+    LIVE_DATA_STRUCTURE,
+    LIVE_FILE_FORMAT_MAP,
+    LIVE_INPUT_FILE_CONFIG_MAP
 )
 
 from ..acoustics import (
@@ -13,29 +15,47 @@
     to_linear
 )
 
+from . import live_data_processing as eldp
+
 class LiveSurvey:
     """
-    A real-time processing version of the `echopop` base
-    `Survey` class that ingests biological, acoustic, and
-    event meta data to provide population estimates when
-    generated.
+    A real-time processing version of the `echopop` base `Survey` class that ingests biological,
+    acoustic, and event meta data to provide population estimates when generated.
""" def __init__( - self + self, + live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path], + update_config: bool = True, + verbose: bool = True, ): # Initialize `meta` attribute - self.meta = copy.deepcopy(DATA_STRUCTURE["meta"]) + self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = el.load_configuration(Path(init_config_path), Path(survey_year_config_path)) - - # Loading the datasets defined in the configuration files - self.input = el.load_survey_data(self.config) - - # Initialize the `analysis` data attribute - self.analysis = copy.deepcopy(DATA_STRUCTURE["analysis"]) - - # Initialize the `results` data attribute - self.results = copy.deepcopy(DATA_STRUCTURE["results"]) \ No newline at end of file + self.config = eldp.live_configuration(Path(live_init_config_path), + Path(live_file_config_path)) + + # Initialize input attribute + self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) + + # Initialize database attribute + self.database = copy.deepcopy(LIVE_DATA_STRUCTURE["database"]) + + # Initialize the results attribute + self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + + # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict + # Ingest data + # ---- Acoustics + self.input["acoustics"]["prc_nasc_df"], self.config = eldp.load_acoustic_data(self.config, + update_config) + # ---- Biology + self.input["biology"], self.config = eldp.load_biology_data(self.config, + update_config) + + # TODO: Add verbosity for printing database filepaths/connections + if verbose: + pass \ No newline at end of file diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py new file mode 100644 index 00000000..37a9d3b7 --- /dev/null +++ b/echopop/live/sql_methods.py @@ -0,0 +1,73 @@ +from sqlalchemy import create_engine, text, Engine, inspect + +import pandas as pd + +SQL_COMMANDS = { + "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", + "drop": "DROP TABLE IF EXISTS {table_name};", + "select": "SELECT {columns} FROM {table_name};", + "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", + # "insert": "INSERT INTO {table_name} ({columns});", + "insert": """ + INSERT INTO {table_name} ({columns}) + SELECT {columns} + FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) + {filter_clause}; + """, + "inspect": None, +} + +SQL_DTYPES = { + 'int32': 'INTEGER', + 'int64': 'INTEGER', + 'float64': 'FLOAT', + 'bool': 'BOOLEAN', + 'datetime64[ns]': 'DATETIME', + 'object': 'TEXT' +} + +# TODO: Documentation +def SQL(db_file: str, command: str, **kwargs): + + # Create engine from `db_file` string + engine = create_engine(f"sqlite:///{db_file}") + + # Format `columns`, if there are any and more than 1 + if "columns" in kwargs.keys(): + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Run the command + try: + with engine.connect() as connection: + # ---- SELECT + if command == "select": + return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # ---- REPLACE + elif command == "replace": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Replace current + df_to_add.to_sql(name=kwargs["table_name"], + 
con=connection, + if_exists="replace", index=False) + + # ---- INSERT + elif command == "insert": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Inser into the table + df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", + index=False) + # ---- INSPECT + elif command == "inspect": + return inspect(engine).get_table_names() + # ---- OTHER COMMAND + else: + connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + finally: + # ---- Dispose of the engine to release any resources being pooled/used + engine.dispose() From 382d444d06b4427dd83902e3e33eaee3a62cbc6b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 22 Jul 2024 15:40:28 -0700 Subject: [PATCH 06/81] General changes --- echopop/live/biology.py | 0 echopop/live/load.py | 0 echopop/live/spatial.py | 0 echopop/live/sql_methods.py | 165 ++++- echopop/live/write.py | 0 echopop/mesh_generation.py | 1186 +++++++++++++++++++++++++++++++++++ 6 files changed, 1339 insertions(+), 12 deletions(-) delete mode 100644 echopop/live/biology.py delete mode 100644 echopop/live/load.py delete mode 100644 echopop/live/spatial.py delete mode 100644 echopop/live/write.py diff --git a/echopop/live/biology.py b/echopop/live/biology.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/load.py b/echopop/live/load.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/spatial.py b/echopop/live/spatial.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 37a9d3b7..e8d8de93 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,9 +1,140 @@ from sqlalchemy import create_engine, text, Engine, inspect - +import sqlalchemy as sqla import pandas as pd +from typing import Optional + +def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, + primary_keys: Optional[list] = None): + """ + Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + + Args: + table_name (str): The name of the table. + columns (dict): A dictionary where keys are column names and values are data types. + primary_keys (list, optional): List of column names to be used as primary keys. + + Returns: + str: The SQL command to create the table. + """ + # Generate column definitions + column_definitions = ( + ",\n".join(f"{col} {SQL_DTYPES[type(col).__name__]}" + for col in df.columns) + ) + + # Generate primary key definition + primary_key_definition = "" + if primary_keys: + primary_key_definition = f",\nPRIMARY KEY ({', '.join(primary_keys)})" + + # Combine all parts into the final SQL command + create_table_command = f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + {column_definitions} + {primary_key_definition} + ); + """ + + # Execute + connection.execute(text(create_table_command.strip())) + +def sql_validate(connection: sqla.Connection, table_name: str): + """ + Check if a table exists in the database. + + Args: + connection: SQLAlchemy Connection object. + table_name (str): The name of the table to check. + + Returns: + bool: True if the table exists, False otherwise. + """ + inspector = inspect(connection) + return table_name in inspector.get_table_names() + +def sql_inspect(connection: sqla.Connection): + """ + Get a list of all tables present + + Args: + connection: SQLAlchemy Connection object. + + Returns: + list: True if the table exists, False otherwise. 
+ """ + return inspect(connection).get_table_names() + +def sql_drop(connection: sqla.Connection, table_name: str): + """ + """ + connection.execute(text(f"DROP TABLE IF EXISTS {table_name};")) + +def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, + id_columns: Optional[list] = None): + """ + Insert data into a table. + + Args: + connection (Connection): The SQLAlchemy Connection instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. + """ + + # Prepare the SQL statement for insertion + # ---- If not a List + if not isinstance(columns, list): + columns = list(columns) + + column_names = ", ".join(columns) + + # Convert the DataFrame into a tuple and then into a string + # ---- DataFrame to Tuple + data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] + # ---- Tuple to String + if dataframe.columns.size == 1: + data_str = ", ".join( + f"{', '.join(map(str, row))}" + for row in data_tuple + ) + else: + data_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuple + ) + + # Construct the "ON CONFLICT, DO UPDATE SET" if needed + on_conflict_clause = "" + if id_columns: + on_conflict_clause = f""" + ON CONFLICT ({', '.join(id_columns)}) + DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} + """ + + # Construct the SQL query + sql_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES ({data_str}) + {on_conflict_clause} + """ + + # Execute + connection.execute(text(sql_command.strip())) + + # Commit + connection.commit() + + SQL_COMMANDS = { - "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "create": sql_create, + "drop": sql_drop, + "inspect": sql_inspect, + "validate": sql_validate, + + + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", "drop": "DROP TABLE IF EXISTS {table_name};", "select": "SELECT {columns} FROM {table_name};", @@ -24,22 +155,32 @@ 'float64': 'FLOAT', 'bool': 'BOOLEAN', 'datetime64[ns]': 'DATETIME', - 'object': 'TEXT' + 'object': 'TEXT', + "str": "TEXT", } +def format_sql_columns(kwargs: dict): + if "columns" in kwargs: + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Return the updated `kwargs` dictionary + return kwargs + +# TODO: Documentation + + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): # Create engine from `db_file` string engine = create_engine(f"sqlite:///{db_file}") - - # Format `columns`, if there are any and more than 1 - if "columns" in kwargs.keys(): - if isinstance(kwargs["columns"], list): - kwargs["columns"] = ", ".join(kwargs["columns"]) - else: - kwargs["columns"] = "*" - + + # Format the data columns, if necessary, to fit within the SQL commands + kwargs = format_sql_columns(kwargs) + # Run the command try: with engine.connect() as connection: @@ -59,7 +200,7 @@ def SQL(db_file: str, command: str, **kwargs): elif command == "insert": # ---- Extract dataframe df_to_add = kwargs["dataframe"] - # ---- Inser into the table + # ---- Insert into the table df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", index=False) # ---- INSPECT diff --git a/echopop/live/write.py b/echopop/live/write.py deleted file mode 100644 index e69de29b..00000000 diff 
--git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 3fab6d89..077d9c93 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -1,3 +1,1189 @@ +import numpy as np +import pandas as pd +from sqlalchemy import create_engine, text +from pathlib import Path +import os + +SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) + +# Coordinates +x = np.array([1, 2, 3, 4, 5]) +y = np.array([1, 2, 3, 4, 5]) + +# Create the grid points +grid_points = [(i, j, 0) for i in x for j in y] + +# +data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") +db_directory = data_root_dir / "database" +# ---- Create the directory if it does not already exist +db_directory.mkdir(parents=True, exist_ok=True) +# ---- Complete path to `biology.db` +db_file = db_directory / "grid.db" + +from sqlalchemy import create_engine, MetaData, Table, select, inspect, update, text, case + +# Initialize the database and create the table +engine = create_engine(f"sqlite:///{db_file}") + +# Define metadata and the table to drop +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Drop the table +with engine.connect() as connection: + grid_table.drop(connection) + print("Table 'grid' has been dropped.") + +# Inspect the database +inspector = inspect(engine) +tables = inspector.get_table_names() +print(tables) + +def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): + """ + Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + + Args: + table_name (str): The name of the table. + columns (dict): A dictionary where keys are column names and values are data types. + primary_keys (list, optional): List of column names to be used as primary keys. + index_columns (list, optional): List of column names to be indexed. + + Returns: + str: The SQL command to create the table. 
+ """ + # Generate column definitions + column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) + + # Generate primary key definition + primary_key_definition = "" + if primary_keys: + primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" + + # Generate index definitions + index_definitions = "" + if index_columns: + index_definitions = "\n".join( + f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" + for col in index_columns + ) + + # Combine all parts into the final SQL command + create_table_command = f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + {column_definitions} + {primary_key_definition} + ); + """ + # Return the command and any index definitions + return create_table_command.strip() + "\n" + index_definitions + +# Define metadata and the table to drop +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Drop the table +with engine.connect() as connection: + grid_table.drop(connection) + print("Table 'grid' has been dropped.") + +check_table_exists(engine, "grid") + +with engine.connect() as connection: + sql_create(connection, df, table_name, primary_keys) + +# Create the table +table_name = "grid" +columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +primary_keys = ["x", "y"] +index_columns = ["x", "y"] + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +with engine.connect() as connection: + connection.execute(text(create_sql)) + +inspector = inspect(engine) +tables = inspector.get_table_names() +print(tables) + +check_table_exists(engine, "grid") + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + +check_table_exists(engine, "files_read") + +zarr_files_str = ["A", "B", "C", "D"] +# ---- Create DataFrame +current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + +with engine.connect() as connection: + sql_create(connection, table_name="files_read", df=current_files) + sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) + +table_name = "files_read" +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + + +from sqlalchemy.exc import IntegrityError + +def insert_or_update(engine, table_name, columns, data, conflict_columns): + """ + Insert or update data in a table. + + Args: + engine (Engine): The SQLAlchemy engine instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. 
+ """ + + # Prepare the SQL statement for insertion + column_names = ", ".join(columns) + placeholder = ", ".join(f":{col}" for col in columns) + # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) + values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data + ) + + + # Construct the SQL query + sql = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_str} + ON CONFLICT ({', '.join(conflict_columns)}) + DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} + """ + + # Flatten the list of data for execution + # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item in data] for item in sublist] + + # Execute the SQL command + with engine.connect() as connection: + try: + connection.execute(text(sql)) + # connection.commit() + print(f"Data inserted or updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Prepare data for insertion or update +# data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] +data = grid_points + +# Insert or update data +insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + +def update_specific_rows(engine, table_name, updates, conditions): + """ + Update specific rows in a table based on conditions. + + Args: + engine (Engine): The SQLAlchemy engine instance. + table_name (str): The name of the table. + updates (dict): Dictionary of columns and their new values to be updated. + conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
+ """ + + # Construct the SET clause for the update + set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) + + # Construct the WHERE clause for the update + where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) + + # Construct the SQL query + sql = f""" + UPDATE {table_name} + SET {set_clause} + WHERE {where_clause} + """ + + # Prepare parameters for the query + parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} + + # Execute the SQL command + with engine.connect() as connection: + try: + connection.execute(text(sql), parameters) + print(f"Rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Define table name +table_name = "grid" +# Define the table and columns +table_name = 'grid' +condition_columns = ['x', 'y'] + +# Define the updates and conditions +dd = {"x": np.array([1, 2, 3 , 4, 5]), "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} +new_data = pd.DataFrame(dd) +new_data +df = new_data + +with engine.connect() as connection: + # sql_create(connection, table_name = "grid", df = df) + # sql_validate(connection, "grid") + # sql_drop(connection, "grid") + sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, id_columns=["x", "y"]) + + +data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +all_columns = df.columns.tolist() +if len(condition_columns) >= len(all_columns): + raise ValueError("The number of condition columns must be less than the number of columns in data.") + +# Prepare column names and conditions +update_columns = [col for col in all_columns if col not in condition_columns] +condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +data_tuples = [tuple(row) for row in df.itertuples(index=False)] +# Generate values string for SQL command +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuples +) + +# Construct the SQL query +sql = f""" +INSERT INTO {table_name} ({', '.join(all_columns)}) +VALUES {values_str} +ON CONFLICT ({', '.join(condition_columns)}) +DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col in update_columns)} +""" + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql)) + connection.commit() + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + +# Insert or update data +insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + +# Ensure that condition_columns match the length of data tuples minus the update column +if len(condition_columns) != len(df.columns) - 1: + raise ValueError("The number of condition columns must match the number of columns in data minus the update column.") + +# Prepare the SQL statement for update +update_columns = [col for col in df.columns if col not in condition_columns] +condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +# Convert DataFrame rows to list of tuples +data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# Generate a values string for the SQL command +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuples +) +# Construct the SQL query +sql = f""" +UPDATE {table_name} +SET {update_str} +WHERE {condition_str} +""" + +# Flatten the list of data for execution +flattened_data = [] +for row in data_tuples: + conditions = row[:len(condition_columns)] + update_values = row[len(condition_columns):] + flattened_data.extend(conditions + update_values) + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql), flattened_data) + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql), flattened_data) + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") +# Update specific rows +update_specific_rows(engine, table_name, updates, conditions) + +# Verify the update +sql_command = f"SELECT * FROM {table_name};" +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + connection.commit() + +check_table_exists(engine, "grid") + +# Define table name, columns, and data +table_name = 'grid' +columns = ['x', 'y', 'value'] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0) +] + +# Prepare the columns part of the SQL statement +columns_str = ", ".join(columns) + +# Prepare the values part of the SQL statement +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data +) + + + + + + +print("Generated SQL Command:") +print(sql_command) + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + +def insert_values_sql(table_name, columns, values, filter_clause=""): + """ + Generate a SQL command to insert values into a table. + + Args: + table_name (str): The name of the table. + columns (list): List of column names to be inserted. + values (list of tuples): List of tuples where each tuple represents a row of values to be inserted. + filter_clause (str, optional): Optional filter clause to specify conditions for insertion. + + Returns: + str: The SQL command to insert values into the table. + """ + # Generate column names + column_names = ", ".join(columns) + + # Generate value placeholders + value_placeholders = ", ".join("?" * len(columns)) + + # Generate values part + values_part = ", ".join(f"({', '.join('?' 
* len(columns))})" for _ in values) + + # Flatten the values list for insertion + flattened_values = [item for sublist in values for item in sublist] + + # Create the SQL command + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + {filter_clause} + """ + return insert_command.strip(), flattened_values + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0)] + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Data:\n", insert_data) + +insrt_stmt = + +with engine.connect() as connection: + connection.execute(text(insert_sql), tuple(insert_data)) + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# Call the function +insert_or_update_table(engine, table_name, columns, data, conflict_columns) + +# Example usage +table_name = "grid" +columns = ["x", "y", "value"] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0), +] + +sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" +test_data = [{'x': 1, 'y': 1, 'value': 1.0}] + +with engine.connect() as connection: + connection.execute(text(sql_command), test_data) + +# Generate the SQL command and data +insert_stmt = insert_into_table(table_name, columns, data) + +# Print the generated SQL command (for validation) +print("Insert SQL Command:") +print(insert_stmt) + +# Print for validation +print("Insert SQL Command:") +print(insert_sql) +print("Data:") +print(insert_data) + +# Example execution with SQLAlchemy +with engine.connect() as connection: + connection.execute(insert_stmt) + +def insert_values_sql(table_name, columns, values): + """ + Generate SQL command for inserting values into a table. + + Args: + table_name (str): The name of the table. + columns (list): List of column names. + values (list of tuples): List of values to insert. + + Returns: + str: The SQL command to insert the values. + list: Flattened list of values for binding to the SQL command. + """ + column_names = ", ".join(columns) + value_placeholders = ", ".join("?" * len(columns)) + values_part = ", ".join(f"({value_placeholders})" for _ in values) + flattened_values = [item for sublist in values for item in sublist] + + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + """ + return insert_command.strip(), flattened_values + +def check_table_exists(engine, table_name): + """ + Check if a table exists in the database. + + Args: + engine: SQLAlchemy engine object. + table_name (str): The name of the table to check. + + Returns: + bool: True if the table exists, False otherwise. + """ + inspector = inspect(engine) + return table_name in inspector.get_table_names() + +with engine.connect() as connection: + # sql_validate(connection, "grid") + sql_inspect(connection) + sql_drop(connection, table_name) + +def select_from_table(engine, table_name, columns='*'): + """ + Select data from a table. + + Args: + engine: SQLAlchemy engine object. + table_name (str): The name of the table to select from. + columns (str or list): Columns to select. '*' selects all columns. + + Returns: + list: List of rows returned by the query. 
+ """ + metadata = MetaData(bind=engine) + table = Table(table_name, metadata, autoload_with=engine) + + if columns == '*': + columns = [col.name for col in table.columns] + elif isinstance(columns, str): + columns = [columns] + + stmt = select([table.c[col] for col in columns]) + + with engine.connect() as connection: + result = connection.execute(stmt) + return result.fetchall() + +# Create table +table_name = "grid" +columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +primary_keys = ["x", "y"] +index_columns = ["value"] + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +with engine.connect() as connection: + connection.execute(create_sql) + +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# Insert data function +def insert_values_sql(table_name, columns, values): + column_names = ", ".join(columns) + value_placeholders = ", ".join("?" * len(columns)) + values_part = ", ".join(f"({value_placeholders})" for _ in values) + + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + """ + # Flatten the list of values into a single list + flattened_values = [value for sublist in values for value in sublist] + + return insert_command.strip(), flattened_values + + +table_name = 'grid' +columns = ['x', 'y', 'value'] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0) +] + +# Prepare the columns part of the SQL statement +columns_str = ", ".join(columns) + +# Prepare the values part of the SQL statement +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data +) + +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +print(f"Data in table {table_name}:") +for row in rows: + print(row) +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Insert Data:\n", insert_data) + +with engine.connect() as connection: + connection.execute(insert_sql, [insert_data]) + +# Check table existence +exists = check_table_exists(engine, table_name) +print(f"Table '{table_name}' exists: {exists}") + +# Select data from table +data = select_from_table(engine, table_name, insert_columns) +print(f"Data from '{table_name}':") +for row in data: + print(row) + + + + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0)] + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Data:\n", insert_data) + +# Example usage +table_name = "grid" +columns = { + "x": "INTEGER", + "y": "INTEGER", + "value": "REAL" +} +primary_keys = ["x", "y"] +index_columns = ["value"] + +sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) +print(sql_command) + +# Create the table +create_table_sql = """ +CREATE TABLE IF NOT EXISTS 
grid ( + x INTEGER, + y INTEGER, + value REAL, + PRIMARY KEY (x, y) +); +""" + +# Insert grid points +insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) +insert_sql = f""" +INSERT INTO grid (x, y, value) VALUES {insert_values}; +""" + +# Connect to the database and execute the commands +with engine.connect() as connection: + try: + # Create table if it does not exist + connection.execute(text(create_table_sql)) + # Insert grid points + connection.execute(text(insert_sql)) + connection.commit() + print("Grid points successfully inserted.") + except Exception as e: + print(f"An error occurred: {e}") + + +engine = create_engine(f"sqlite:///{db_file}") +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Read existing grid values from the database into a DataFrame +with engine.connect() as connection: + select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) + result = connection.execute(select_stmt) + existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# Coordinates to update +update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# Create a dictionary for fast lookup +update_dict = {(i, j): 1.0 for i, j in update_coords} + +# Update the grid_points with new values where applicable +updated_grid_points = [ + (i, j, update_dict.get((i, j), value)) + for i, j, value in grid_points +] + +# Convert the list of tuples to a DataFrame +df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) + +# Print the DataFrame +print(df_updated_grid_points) + +# Merge existing and updated data to find differences +merged_data = pd.merge(existing_data, df_updated_grid_points, on=['x', 'y'], suffixes=('_existing', '_updated')) +differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] + +# Assuming 'differences' is your DataFrame with updated values +# Create a dictionary for batch updating +update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() + +# Generate the SQLAlchemy update statement +update_stmt = update(grid_table).where( + grid_table.c.x.in_(update_dict.keys()) +).values({ + grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) +}) + +# Create the CASE statement +case_stmt = case( + { + (grid_table.c.x == x) & (grid_table.c.y == y): value + for (x, y), value in update_dict.items() + }, + else_=grid_table.c.value +) + +# Convert the DataFrame into a dictionary of case statements +case_stmt = case( + [(grid_table.c.x == x) & (grid_table.c.y == y), value] + for (x, y), value in update_dict.items() +) + +# Create the case statement +case_stmt = case( + { (x, y): value for (x, y), value in update_dict.items() }, + value=grid_table.c.x, # Assuming `x` is the column being compared + else_=grid_table.c.value +) + +case_stmt = case( + { + (x, y): value + for (x, y), value in update_dict.items() + }, + value=grid_table.c.x, + else_=grid_table.c.value +) + +# Create the case statement +# Create a CASE statement using a dictionary +case_stmt = case( + { + (grid_table.c.x == x) & (grid_table.c.y == y): value + for (x, y), value in update_dict.items() + }, + else_=grid_table.c.value +) +case_stmt = case( + {((grid_table.c.x == x) & (grid_table.c.y == y)): value + for (x, y), value in update_dict.items()}, + else_=grid_table.c.value +) +print("Case Statement:", str(case_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + +# Create the update statement +update_stmt = ( + 
update(grid_table). + where(grid_table.c.value != case_stmt). + values(value=case_stmt) +) + +print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + +# Print the SQL for each update +for (x, y), value in update_dict.items(): + update_stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=value) + ) + # Print the SQL statement with literal values for debugging + print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + # Execute the update statement + with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") + +# Execute the update +with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries.") + +engine.dispose() + +engine = create_engine(f"sqlite:///{db_file}") +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Verify the updated rows +select_stmt = select(grid_table) + +with engine.connect() as connection: + result = connection.execute(select_stmt) + rows = result.fetchall() + +for row in rows: + print(row) + +# Define your SQLite engine and metadata +engine = create_engine(F'sqlite:///{db_file}') +metadata = MetaData() + +# Reflect the grid table +grid_table = Table('grid', metadata, autoload_with=engine) + +# Define your update dictionary +update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} + +# Execute updates +# with engine.connect() as connection: +connection = engine.connect() +# for (x, y), value in update_dict.items(): +(x,y) = (1, 1) +value = update_dict[(1,1)] + +update_stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=value) +) +# Print the SQL statement for debugging +print("Executing Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + +# Execute the update statement +result = connection.execute(update_stmt) +print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") +connection.close() + +select_stmt = select(grid_table.c.x) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + x_values = result.fetchall() + +type(x_values[0]) + +select_stmt = select(grid_table.c.y) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + y_values = result.fetchall() + +select_stmt = select(grid_table.c.value) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + values = result.fetchall() + +case_stmt = case( + *[(grid_table.c.x == x) & (grid_table.c.y == y, value) + for (x, y), value in update_dict.items()], + else_=grid_table.c.value +) + +update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} + +with engine.connect() as connection: + # Select all values to check the current state + result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) + current_values = result.fetchall() + print("Current Values:", current_values) + +with engine.connect() as connection: + with connection.begin(): # Begin a transaction + for (x, y), value in update_dict.items(): + stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=grid_table.c.value + value) + ) + 
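            # Each iteration issues one UPDATE that increments the stored `value` for a single
            # (x, y) coordinate; the surrounding `connection.begin()` block commits all of the
            # increments together as one transaction.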
connection.execute(stmt) + +with engine.connect() as connection: + # Re-select to check the updated state + result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) + updated_values = result.fetchall() + print("Updated Values:", updated_values) + + +# Confirm the updates +with engine.connect() as connection: + select_stmt = select([grid_table]) + result = connection.execute(select_stmt) + rows = result.fetchall() + +# Print all rows to verify updates +print("Database contents after update:") +for row in rows: + print(row) + + +# Construct the update statement +update_stmt = ( + update(grid_table) + .values(value=case_stmt) + .where(grid_table.c.value != case_stmt) +) + +# Create a SELECT statement to fetch all rows from the grid_table +select_stmt = select(grid_table) + +# Execute the SELECT statement and fetch results +with engine.connect() as connection: + result = connection.execute(select_stmt) + rows = result.fetchall() + +# Print or inspect the fetched rows +for row in rows: + print(row) + +# Create the update statement +update_stmt = ( + update(grid_table) + .where(grid_table.c.value != case_stmt) + .values(value=case_stmt) +) + +# Execute the update +with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries.") + +case( + [ + ((grid_table.c.x == x) & (grid_table.c.y == y), value) + for (x, y), value in update_dict.items() + ], + else_=grid_table.c.value +) + +# Create a case statement for conditional update +case_statements = { + (x, y): case( + [(grid_table.c.x == x) & (grid_table.c.y == y, value)], + else_=grid_table.c.value + ) + for (x, y), value in update_dict.items() +} + + +# Define SQL command to select all data from the grid table +select_sql = "SELECT * FROM grid;" + +# Connect to the database and execute the query +with engine.connect() as connection: + try: + # Execute the select command + result = connection.execute(text(select_sql)) + # Fetch all rows from the result + rows = result.fetchall() + # Print the results + print("Data in grid table:") + for row in rows: + print(row) + except Exception as e: + print("An error occurred: {}".format(e)) + +# Coordinates to update +update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# Create a copy of grid_points and update specific coordinates +updated_grid_points = [ + (i, j, 1.0) if (i, j) in update_coords else (i, j, value) + for i, j, value in grid_points +] + +# Retrieve current data from the database +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + current_data = result.fetchall() + +# Convert to a dictionary for easy comparison +current_values = {(x, y): value for x, y, value in current_data} + +# Convert updated_grid_points to a dictionary +updated_values = {(i, j): value for i, j, value in updated_grid_points} + +# Find differences +differences = [ + (i, j, value) + for i, j, value in updated_grid_points + if (i, j) in updated_values and (i, j) not in current_values or + (i, j) in current_values and current_values[(i, j)] != value +] + +# Update differing values in the database +with engine.connect() as connection: + for i, j, value in differences: + connection.execute( + text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), + ) + print(f"Updated {len(differences)} entries.") + +# Step 8: Read the table into Python +with engine.connect() as connection: + # Query to select all rows from the table + result = connection.execute(text("SELECT x, y, value 
FROM grid;")) + df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# Print the DataFrame to validate the changes +print(df) + +# Check current values +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} + +print("Current grid points in database:") +for row in current_values.items(): + print(row) + +print("Updated grid points with changes:") +for row in updated_grid_points: + print(row) + +# Determine differences +differences = [ + (i, j, value) + for i, j, value in updated_grid_points + if (i, j) in current_values and current_values[(i, j)] != value +] + +print(f"Differences to update: {differences}") + +# Step 6: Update the database with INSERT OR REPLACE +with engine.connect() as connection: + with connection.begin(): # Ensure transactions are committed + for i, j, value in updated_grid_points: + sql = """ + INSERT OR REPLACE INTO grid (x, y, value) + VALUES (:x, :y, :value) + """ + print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") + connection.execute( + text(sql), + {"x": i, "y": j, "value": value} + ) + print(f"Updated entries with INSERT OR REPLACE.") + +# Step 8: Read the table into Python +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + rows = result.fetchall() + df = pd.DataFrame(rows, columns=['x', 'y', 'value']) + +# Print the DataFrame to validate the changes +print("Updated table data:") +print(df) + + +engine.dispose() + +# Check if the file exists and then remove it +if db_file.exists(): + db_file.unlink() + print(f"Deleted the file: {db_file}") +else: + print(f"The file does not exist: {db_file}") + +with engine.connect() as connection: + connection.execute(text(""" + CREATE TABLE IF NOT EXISTS grid ( + x INTEGER, + y INTEGER, + value REAL, + PRIMARY KEY (x, y) + ); + """)) + + connection.execute(text(""" + INSERT OR REPLACE INTO grid (x, y, value) VALUES + (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), + (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), + (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), + (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), + (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); + """)) + + # Insert initial values (0) into the grid table + values = ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + connection.execute(text("INSERT INTO grid (x, y, value) VALUES {values};".format(values=values))) + + # Commit + connection.commit() + + # Verify data insertion + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + print("Data in grid table:", rows) + + connection.execute(text(""" + INSERT INTO grid (x, y, value) VALUES + """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) + +engine.dispose() + + + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + print("Data in grid table:", rows) + +with engine.connect() as connection: + result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) + print(result.fetchall()) + +with engine.connect() as connection: + # Describe the table schema + result = connection.execute(text("PRAGMA table_info(grid);")) + columns = result.fetchall() + print("Table schema:", columns) + +with engine.connect() as connection: + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + for row in rows: + 
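        # Each fetched `row` is an (x, y, value) tuple from the re-created `grid` table.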
print(row) + +SQL(db_file, command="select") + + + + + import pandas as pd import numpy as np import matplotlib.pyplot as plt From 2374ce9b96d2a17dd02af8f63104dcbf5f74ecbf Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 24 Jul 2024 18:26:04 -0700 Subject: [PATCH 07/81] Commited SQL changes --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 12 +- .../{liveacoustics.py => live_acoustics.py} | 0 echopop/live/{livecore.py => live_core.py} | 16 +- echopop/live/live_data_processing.py | 1029 ++++++++++++----- .../live/{livesurvey.py => live_survey.py} | 16 +- echopop/live/sql_methods.py | 234 ++-- echopop/mesh_generation.py | 113 +- echopop/zarr_read_ingest_test.py | 830 ++++++------- 9 files changed, 1419 insertions(+), 835 deletions(-) rename echopop/live/{liveacoustics.py => live_acoustics.py} (100%) rename echopop/live/{livecore.py => live_core.py} (86%) rename echopop/live/{livesurvey.py => live_survey.py} (78%) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 84c48bbb..a407520e 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -35,8 +35,8 @@ longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: - x: 25.0 - y: 25.0 + x_distance: 50.0 + y_distance: 50.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index a8450039..b7b7aef4 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -19,22 +19,30 @@ data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files ############################################################################## # Input data directories input_directories: - acoustic: + acoustics: directory: acoustics/ + database_name: acoustics.db extension: zarr - biological: + biology: directory: biology/ + database_name: biology.db extension: csv file_name_formats: catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" specimen: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:spec}" + trawl_info: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:operation_info}" file_index: catch: [haul_num] length: [haul_num, species_id] specimen: [haul_num, species_id] + trawl_info: [] file_ids: catch: catch_perc length: lf specimen: spec + trawl_info: operation_info + coastline: + directory: coastline/ + coastline_name: ne_110m_land ... 
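As a minimal sketch of how the `file_name_formats` templates added above (including the new
`trawl_info` entry) can drive file discovery: the patch later compiles each template into a
regular expression via `compile_filename_format`, substituting every `{TAG}` with an expression
from `LIVE_FILE_FORMAT_MAP`. Those expressions are not reproduced in this patch, so the numeric
patterns below are assumptions used only for illustration, as is the example filename.

import re

# Assumed stand-ins for the expressions defined in `LIVE_FILE_FORMAT_MAP` (not shown here)
EXAMPLE_FORMAT_MAP = {
    "DATE:YYYYMM": r"(?P<DATE>\d{6})",
    "HAUL": r"(?P<HAUL>\d+)",
    "SPECIES_CODE": r"(?P<SPECIES_CODE>\d+)",
}

def example_compile_filename_format(file_name_format: str) -> re.Pattern:
    """Turn a bracketed filename template into a compiled regex (illustration only)."""
    regex_pattern = file_name_format
    # Swap each `{TAG}` placeholder for its (assumed) regular expression
    for key, expression in EXAMPLE_FORMAT_MAP.items():
        regex_pattern = regex_pattern.replace(f"{{{key}}}", expression)
    # Replace the `{FILE_ID:...}` tag with a named group capturing the literal identifier
    regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P<FILE_ID>\1)", regex_pattern)
    return re.compile(regex_pattern)

# Hypothetical catch file matching the `catch` template declared in the configuration above
pattern = example_compile_filename_format("{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}")
match = pattern.search("201906_017_catch_perc.csv")
if match:
    print(match.groupdict())  # {'DATE': '201906', 'HAUL': '017', 'FILE_ID': 'catch_perc'}

The captured groups correspond to the identifiers listed under `file_index`, which appears to be
how `read_biology_csv` (below) attaches haul and species information to each ingested CSV.
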
diff --git a/echopop/live/liveacoustics.py b/echopop/live/live_acoustics.py similarity index 100% rename from echopop/live/liveacoustics.py rename to echopop/live/live_acoustics.py diff --git a/echopop/live/livecore.py b/echopop/live/live_core.py similarity index 86% rename from echopop/live/livecore.py rename to echopop/live/live_core.py index 83e72a86..95750f5f 100644 --- a/echopop/live/livecore.py +++ b/echopop/live/live_core.py @@ -63,6 +63,20 @@ "catch_perc": "catch_percentage", } }, + "trawl_info": { + "dtypes": { + "operation_number": int, + "td_timestamp": str, + "td_latitude": float, + "td_longitude": float, + }, + "names": { + "operation_number": "haul_num", + "td_timestamp": "datetime", + "td_latitude": "latitude", + "td_longitude": "longitude", + }, + }, "length": { "dtypes": { "sex": str, @@ -73,7 +87,7 @@ "sex": "sex", "rounded_length": "length", "frequency": "length_count", - } + }, }, "specimen": { "dtypes": { diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 293862c4..fd89993c 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,13 +2,13 @@ import re from pathlib import Path -from typing import Union, Tuple +from typing import Union, Tuple, Optional, List import pandas as pd import xarray as xr import numpy as np -from .livecore import( +from .live_core import( LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP @@ -34,7 +34,9 @@ def live_configuration(live_init_config_path: Union[str, Path], missing_config = [ files for files, exists in zip(config_files, config_existence) if not exists ] - raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + raise FileNotFoundError( + f"The following configuration files do not exist: {missing_config}." + ) # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings @@ -56,21 +58,147 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -# TODO: Documentation -def compile_filename_format(file_name_format: str): +def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: - # Create a copy of `file_name_format` - regex_pattern = file_name_format + # Get acoustic directory and initialization settings + # ---- Create the full filepath + directory_path = Path(root_directory) / file_settings["directory"] + # ---- Get the defined file extension + file_extension = file_settings["extension"] + + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_extension}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_extension}` files found in [{directory_path}]!" 
+ ) + + # Return the output + return data_files - # Compile the regex pattern and return the output - return re.compile(regex_pattern) +def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = Path(root_directory) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Create a list of string-formatted Path names + files_str = [str(file) for file in files] + # ---- Create DataFrame + current_files = pd.DataFrame(files_str, columns=["filepath"]) + + # Check for the table `files_read` + files_read_tbl = SQL(db_file, "validate", table_name="files_read") + + # Validate whether the table exists; if not, create the table and then insert + if not files_read_tbl: + # ---- Create table + SQL(db_file, "create", table_name="files_read", dataframe=current_files, + primary_keys = ["filepath"]) + # ---- Populate table + SQL(db_file, "insert", table_name="files_read", dataframe=current_files) + # ---- Break early + return files_str, db_file + + # Query already existing files + previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) + # ---- Insert file list + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + + # Filter out previously processed files + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file + +def sql_data_exchange(database_file: Path, **kwargs): + + # Check whether the `table_name` table exists + table_exists = SQL(database_file, "validate", **kwargs) + + # If empty and table does not exist + if kwargs["dataframe"].empty and table_exists: + return SQL(database_file, "select", **kwargs) + + # Create table if it does not exist and run the initial insertion + if not table_exists: + # ---- Create table + SQL(database_file, "create", **kwargs) + # ---- Ignore the `id_columns` argument, if present + try: + del kwargs["id_columns"] + except KeyError: + pass + # ---- Insert into table + SQL(database_file, "insert", **kwargs) + # ---- Return the initial dataframe + return kwargs.get("dataframe") + + # Insert into the table + SQL(database_file, "insert", **kwargs) + + # Select existing data frame the database and return the output + return SQL(database_file, "select", **kwargs) + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Get the file-specific settings, datatypes, columns, etc. 
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + + # Determine the file loading method for the `acoustic_files` + if len(acoustic_files) > 1: + zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + else: + zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one file: in " + f"{', '.join(missing_columns)}!" + ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Gather some of the units + data_units = { + "longitude": zarr_data_ds.longitude.units, + "latitude": zarr_data_ds.latitude.units, + "frequency": zarr_data_ds.frequency_nominal.units, + } + # Return a Tuple + return zarr_data_df_filtered, data_units # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -90,17 +218,104 @@ def configure_transmit_frequency(frequency_values: pd.Series, # ---- No change else: return frequency_values + +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] -# TODO: Documentation -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + # Get and validate the acoustic data directory and files + acoustic_files = 
validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files, file_configuration["database"]["acoustics"] = ( + query_processed_files(root_directory, file_settings, acoustic_files) + ) + + # Read in the acoustic data files + if new_acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + # ---- Return output + return prc_nasc_df_processed + else: + return None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] + + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) # Validate the dataframe # ---- Check for any missing columns missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] + [key for key in config_map["dtypes"].keys() if key not in df.columns] ) # ---- Raise Error, if needed if missing_columns: @@ -108,9 +323,9 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
) # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) + df_validated = df.astype(config_map["dtypes"]) # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) + df_validated = df_validated.rename(columns=config_map["names"]) # Get the substring components that can be added to the DataFrame filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) @@ -130,293 +345,545 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # Return the resulting DataFrame return df_validated -# TODO: Documentation -# TODO: Refactor, break up cyclomatic complexity -def load_biology_data(file_configuration: dict, update_config: bool = True): +def preprocess_biology_data(biology_output: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - # Get acoustic directory and initialization settings - # ---- Files - biology_file_settings = file_configuration["input_directories"]["biological"] - # ---- General settings - biology_analysis_settings = file_configuration["biology"] + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return results_dict + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + 
"latitude"]) + ) + + # Return a list of the output + return list(key_columns) + +def load_biology_data(file_configuration: dict): + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["biology"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + biology_files = validate_data_directory(root_directory, file_settings) + + # Query `biology.db` to process only new files (or create the db file in the first place) + # SQL(biology_db, "drop", table_name="files_read") + new_biology_files, file_configuration["database"]["biology"] = ( + query_processed_files(root_directory, file_settings, biology_files) + ) # Get the file-specific settings, datatypes, columns, etc. # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] # ---- Extract all of the file ids biology_config_ids = list(biology_file_ids.keys()) # ---- Initialize the dictionary that will define this key in the `input` attribute biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Initialize the SQL dictionary - sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - # Create full filepath - biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] ) - # ---- Directory check - directory_existence = biology_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." 
+ + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + file_settings["file_name_formats"][dataset], + new_biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Pre-process and return the results + return preprocess_biology_data(biology_output, file_configuration) + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], + }, + }, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] + }, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] + }, + "pairs": [("longitude", "latitude"), ("northings", "eastings")], + }, + "grid_resolution": { + "x_distance": { + "types": float, + }, + "y_distance": { + "types": float, + }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], + }, + }, + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, + }, + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, + }, +} + +def validate_spatial_config(spatial_config: dict): + + # Check the link method + # ---- Extract string-formatted method name + link_method = spatial_config["link_biology_acoustics"].lower() + # ---- Validate + if link_method not in SPATIAL_CONFIG_MAP.keys(): + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." ) - # ---- Get the defined file extension - file_extension = biology_file_settings["extension"] - # ---- Create Path.glob generator object - file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") - #---- Create list of `*.csv`` files - csv_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
+ + # Verify that associated parameters are present in the configuration settings + # ---- Get keys as a list + config_keys = list(spatial_config.keys()) + # ---- Check for specific methods + if link_method not in config_keys and link_method != "global": + raise ValueError( + f"No parameters provided for the biology-acoustic linking ([{link_method}])." + ) + + # Check key settings + if link_method == "griddify": + validate_griddify_config(spatial_config, link_method) + elif link_method == "inpfc": + validate_inpfc_config(spatial_config, link_method) + elif link_method != "global": + validate_hauls_config(spatial_config, link_method) + +def validate_hauls_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for `proximity` + if "proximity" not in input_method_settings.keys(): + raise KeyError( + "The following parameters are missing from the biology-acoustic linking method: " + "'proximity'!" ) - else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - 
sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files + + # Evaluate valid options for `proximity` + if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: + raise KeyError( + f"Value biology-acoustic linking method parameter `proximity` must be one of the : " + f"following: {link_method_settings["proximity"]["choices"]}." + ) + +def validate_griddify_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key] + # ---- Compare entries + parameter_diff = set(input.keys()).difference(set(model.keys())) + # ---- Raise Error + if parameter_diff: + raise KeyError( + f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " + f"configuration." + ) + # ---- Check if the appropriate coordinate pairs are present + coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] + # ---- Count the number of paired coordinates + pair_counts = [len(param) for param in coordinate_pairs] + # ---- If there are multiple pairs + if (np.array(pair_counts) == 2).sum() != 1: + raise ValueError( + f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " + f"for the link method '{link_method}' defined via the following options: " + f"{model["pairs"]}." + ) + # ---- Check the datatypes + for parameter in input.keys(): + # ---- Get the datatypes + config_dtypes = model[parameter]["types"] + # ---- Get input parameter + input_parameter = input[parameter] + # ---- If List + if isinstance(config_dtypes, list): + if not isinstance(input_parameter, list): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be contained within a list." 
+ ) else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + input_parameter = [input_parameter] + config_dtypes = [config_dtypes] + # ---- Check correct datatypes + if not np.all([type(value) in config_dtypes for value in input_parameter]): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be one of the following types within a list: " + f"{config_dtypes}." + ) - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } +def validate_inpfc_config(spatial_config: dict, link_method: str): - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key]["types"] + # ---- Evaluate if a list + if not isinstance(input, list): + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be contained within a list." + ) + # ---- Evaluate if it is a type within the list + if not type(input[0]) in model: + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be one of the following types within a list: {model}." 
+ ) - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output +def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. -# TODO: Documentation -def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: - # Get acoustic directory and initialization settings - # ---- Files - acoustic_file_settings = file_configuration["input_directories"]["acoustic"] - # ---- General settings - acoustic_analysis_settings = file_configuration["acoustics"] + # Extract the INPFC definitions + inpfc_definitions = spatial_config["inpfc"] + + # Create latitude bins + latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) + # ---- Append 1 more stratum layer + bin_names = np.concatenate([inpfc_definitions["stratum_names"], + [np.max(inpfc_definitions["stratum_names"]) + 1]]) - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ---- Initialize the dictionary that will define this key in the `input` attribute - acoustics_output = {"prc_nasc_df": pd.DataFrame(), - "nasc_df": pd.DataFrame()} - # ---- Initialize the SQL dictionary - # sql_acoustics_output = {"sv_df": pd.DataFrame()} - - # Create full filepath - acoustic_directory_path = ( - Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + # Create spatial key + spatial_config["spatial_key"] = pd.DataFrame({ + "latitude_limit": inpfc_definitions["latitude_max"], + }) + # ---- Cut + spatial_config["spatial_key"]["stratum"] = ( + pd.cut(inpfc_definitions["latitude_max"], + latitude_bins, + right = True, + labels = bin_names) ) - - # Validate filepath, columns, datatypes - # ---- Directory check - directory_existence = acoustic_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{acoustic_directory_path}] does not exist." 
+ + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + # ---- Bin the latitude data + acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( + acoustic_data["prc_nasc_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, ) - # ---- Get the defined file extension - file_extension = acoustic_file_settings["extension"] - # ---- Create Path.glob generator object (the case of a *.zarr file) - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" + + # Get the `trawl_info_df` values, if they exist, and apply stratification information + if not biology_data["trawl_info_df"].empty: + # ---- Bin the latitude data + biology_data["trawl_info_df"]["stratum"] = pd.cut( + biology_data["trawl_info_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, ) + +def define_boundary_box(boundary_dict: dict, projection: str): + + # Get x-coordinates + if "longitude" in boundary_dict.keys(): + x = np.array(boundary_dict["longitude"]) else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "acoustics.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - zarr_files_str = [str(file) for file in zarr_files] - # ---- Create DataFrame - current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `zarr_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in zarr_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Find new files that have not yet been processed - if not new_files: - subset_files = zarr_files + x = np.array(boundary_dict["northings"]) + + # Get y-coordinates + if "latitude" in boundary_dict.keys(): + y = np.array(boundary_dict["latitude"]) else: - subset_files = set(zarr_files).intersection(set(new_files)) + y = np.array(boundary_dict["eastings"]) - # Read in the `*.zarr` file(s) - # ! 
[REQUIRES DASK] ---- Read in the listed file - if len(subset_files) > 1: - zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - elif len(subset_files) == 1: - zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + # Create a boundary DataFrame + bound_df = pd.DataFrame({ + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + }) - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + # Convert to a GeoDataFrame and return the GeoDataFrame + return gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), + crs=projection, ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - # Extract defined acoustic frequency - # ---- From the configuration - transmit_settings = acoustic_analysis_settings["transmit"] - # ---- Transform `frequency_nominal`, if necessary - zarr_data_df_filtered["frequency_nominal"] = ( - configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], - transmit_settings, - zarr_data_ds["frequency_nominal"].units) - ) - # ---- Filter out any unused frequency coordinates - zarr_data_df_output = ( - zarr_data_df_filtered - [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) - # ---- Drop frequency column and return the output - acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) - # ---- Return output - if update_config: - if file_configuration["database"]["acoustics"] is None: - file_configuration["database"]["acoustics"] = db_file - return acoustics_output, file_configuration + +def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the griddification definitions + griddify_definitions = spatial_config["griddify"] + + # Get the projection definition + projection = spatial_config["projection"] + + # Compute the boundary box GeoDataFrame + boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + + # Convert the coordinates, if needed + if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): + # ---- Compute the equivalent UTM string + utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), + np.median(boundary_box.loc[0:3, "y"]))) + # ---- Compute the boundary box GeoDataFrame with the new projection + boundary_box = boundary_box.to_crs(utm_num) + # ---- Create a new projection for later + projection_new = f"epsg:{utm_num}" else: - return acoustics_output \ No newline at end of file + 
projection_new = projection + + # Define the step sizes + # ---- Define x step size + x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters + # ---- Define y step size + y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + + # Get the boundary tuple + xmin, ymin, xmax, ymax = boundary_box.total_bounds + + # Generate the cells + grid_cells = [] + # ---- Iterate through + for y0 in np.arange(ymin, ymax+y_step, y_step): + for x0 in np.arange(xmin, xmax+x_step, x_step): + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + + # Get the centroids + cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + + # + prc_nasc_df = acoustic_data["prc_nasc_df"] + + # to GDF + prc_nasc_gdf = gpd.GeoDataFrame( + data=prc_nasc_df, + geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), + crs=projection, + ) + # to UTM + prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + + prc_nasc_new["x"] = prc_nasc_new["geometry"].x + prc_nasc_new["y"] = prc_nasc_new["geometry"].y + + # ---- Bin the latitude data + prc_nasc_new["stratum_x"] = pd.cut( + prc_nasc_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + prc_nasc_new["stratum_y"] = pd.cut( + prc_nasc_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + acoustic_data["prc_nasc_df"]["stratum"] = ( + prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) + ) + + if not biology_data["trawl_info_df"].empty: + + # + trawl_info_df = biology_data["trawl_info_df"] + + # to GDF + trawl_info_gdf = gpd.GeoDataFrame( + data=trawl_info_df, + geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), + crs=projection, + ) + # to UTM + trawl_info_new = trawl_info_gdf.to_crs(projection_new) + + trawl_info_new["x"] = trawl_info_new["geometry"].x + trawl_info_new["y"] = trawl_info_new["geometry"].y + + # ---- Bin the latitude data + trawl_info_new["stratum_x"] = pd.cut( + trawl_info_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + trawl_info_new["stratum_y"] = pd.cut( + trawl_info_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + biology_data["trawl_info_df"]["stratum"] = ( + trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) + ) + diff --git a/echopop/live/livesurvey.py b/echopop/live/live_survey.py similarity index 78% rename from echopop/live/livesurvey.py rename to echopop/live/live_survey.py index 6d6a8621..e8c60da5 100644 --- a/echopop/live/livesurvey.py +++ b/echopop/live/live_survey.py @@ -3,10 +3,8 @@ import copy import yaml -from .livecore import( +from .live_core import( LIVE_DATA_STRUCTURE, - LIVE_FILE_FORMAT_MAP, - LIVE_INPUT_FILE_CONFIG_MAP ) from ..acoustics import ( @@ -37,7 +35,11 @@ def __init__( # initialize the Survey class object self.config = 
eldp.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - + # ---- Initialize config key for database files + self.config.update( + {"database": {key: None for key in self.config["input_directories"].keys()}} + ) + # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) @@ -50,11 +52,9 @@ def __init__( # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict # Ingest data # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"], self.config = eldp.load_acoustic_data(self.config, - update_config) + self.input["acoustics"]["prc_nasc_df"] = eldp.load_acoustic_data(self.config) # ---- Biology - self.input["biology"], self.config = eldp.load_biology_data(self.config, - update_config) + self.input["biology"] = eldp.load_biology_data(self.config) # TODO: Add verbosity for printing database filepaths/connections if verbose: diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index e8d8de93..4b282e13 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -3,7 +3,7 @@ import pandas as pd from typing import Optional -def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, +def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): """ Generate a SQL command to create a table with dynamic columns, primary keys, and indices. @@ -18,8 +18,8 @@ def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, """ # Generate column definitions column_definitions = ( - ",\n".join(f"{col} {SQL_DTYPES[type(col).__name__]}" - for col in df.columns) + ",\n".join(f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" + for col in dataframe.columns) ) # Generate primary key definition @@ -38,6 +38,12 @@ def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, # Execute connection.execute(text(create_table_command.strip())) +def sql_map_tables(connection: sqla.Connection): + """ + """ + inspector = inspect(connection) + return inspector.get_table_names() + def sql_validate(connection: sqla.Connection, table_name: str): """ Check if a table exists in the database. @@ -52,7 +58,7 @@ def sql_validate(connection: sqla.Connection, table_name: str): inspector = inspect(connection) return table_name in inspector.get_table_names() -def sql_inspect(connection: sqla.Connection): +def sql_inspect(connection: sqla.Connection, table_name: str): """ Get a list of all tables present @@ -62,7 +68,16 @@ def sql_inspect(connection: sqla.Connection): Returns: list: True if the table exists, False otherwise. 
""" - return inspect(connection).get_table_names() + + # Create 'inspector' for the db file + inspector = inspect(connection) + + # Retrieve column information + column_info = inspector.get_columns(table_name) + + # Format as a dictionary + return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} + def sql_drop(connection: sqla.Connection, table_name: str): """ @@ -83,26 +98,42 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data """ # Prepare the SQL statement for insertion + # ---- Check whether `columns` is '*' + if "*" in columns: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + columns = [col['name'] for col in inspector.get_columns(table_name)] # ---- If not a List - if not isinstance(columns, list): - columns = list(columns) - + elif not isinstance(columns, list): + columns = [columns] + # ---- Prepare the columns as a string of column names column_names = ", ".join(columns) + + # Format `id_columns` + if id_columns is not None and not isinstance(id_columns, list): + id_columns = [id_columns] # Convert the DataFrame into a tuple and then into a string + # ---- Replace NaN with None + dataframe = dataframe.replace([np.nan], [None]) + # ---- Identify any possible DATETIME columns + # datetime_columns = ( + # {col["name"]: str for col in columns_info + # if isinstance(col["type"], sqla.sql.sqltypes.DATETIME)} + # ) + # ---- Encapsulate datetimes with quotes by converting to string + # dataframe = dataframe.astype(datetime_columns) # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] # ---- Tuple to String - if dataframe.columns.size == 1: - data_str = ", ".join( - f"{', '.join(map(str, row))}" - for row in data_tuple - ) - else: - data_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuple - ) + data_str = ", ".join( + # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" + f"({', '.join(map(lambda x: f'\'{x}\'' + if isinstance(x, str) or isinstance(x, pd.Timestamp) + else 'NULL' if x is None else str(x), row))})" + for row in data_tuple + ) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -115,57 +146,114 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Construct the SQL query sql_command = f""" INSERT INTO {table_name} ({column_names}) - VALUES ({data_str}) + VALUES {data_str} {on_conflict_clause} """ - + # Execute connection.execute(text(sql_command.strip())) # Commit connection.commit() - +from typing import Literal +import numpy as np +def sql_select(connection: sqla.Connection, table_name: str, columns: list, + output_type: type = pd.DataFrame): + + # Prepare the columns as a string of column names + column_names = ", ".join(columns) + + # Format the SQL command + sql_command = f"SELECT {column_names} FROM {table_name};" + + # Execute the command + table = connection.execute(text(sql_command)) + + # Fetch the data from the table + data = table.fetchall() + # Inspect the table to construct a dictionary of expected datatypes for each column + table_info = sql_inspect(connection, table_name=table_name) + # ---- Whittle down the information dictionary to isolate just the column datatypes + table_dtypes = {col: info['type'] for col, info in table_info.items()} + + # Raise error if `output_type` is invalid + if output_type not in [pd.DataFrame, np.ndarray, str, tuple]: + raise 
TypeError( + f"Argument `output_type` ([{output_type}]) must be either `str`, `tuple`, " + f"`pandas.DataFrame`, or `numpy.ndarray`." + ) + + # Format the output + # ---- DataFrame + if output_type is pd.DataFrame: + # ---- Create DataFrame + output_df = pd.DataFrame(data, columns=table.keys()) + # ---- Format the expected datatypes + df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] for col, dtype in table_dtypes.items()} + # ---- Apply the dtypes + return output_df.astype(df_dtypes) + else: + # ---- Get the datatypes that will correspond to each value of the tuples + tuple_dtypes = [SQL_DTYPES[type(dtype).__name__] for _, dtype in table_dtypes.items()] + # ---- Convert the `Row` objects to tuples + converted_data = [ + tuple(dtype(value) if value is not None else None + for value, dtype in zip(row, tuple_dtypes)) + for row in data + ] + # ---- String + if output_type is str: + return [item[0] for item in converted_data] + # ---- Array + elif output_type is np.ndarray: + return np.array([item[0] for item in converted_data]) + # ---- Tuple + else: + return converted_data + SQL_COMMANDS = { - "create": sql_create, - "drop": sql_drop, - "inspect": sql_inspect, - "validate": sql_validate, - + "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), + "drop": dict(function=sql_drop, args=["table_name"]), + "insert": dict(function=sql_insert, args=["table_name", "columns", "dataframe", "id_columns"]), + "inspect": dict(function=sql_inspect, args=["table_name"]), + "map": dict(function=sql_map_tables, args=[]), + "select": dict(function=sql_select, args=["table_name", "columns", "output_type"]), + "validate": dict(function=sql_validate, args=["table_name"]), +} - "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", - "drop": "DROP TABLE IF EXISTS {table_name};", - "select": "SELECT {columns} FROM {table_name};", - "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", - # "insert": "INSERT INTO {table_name} ({columns});", - "insert": """ - INSERT INTO {table_name} ({columns}) - SELECT {columns} - FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) - {filter_clause}; - """, - "inspect": None, -} - SQL_DTYPES = { 'int32': 'INTEGER', 'int64': 'INTEGER', 'float64': 'FLOAT', + "float": "FLOAT", + "int": "INTEGER", 'bool': 'BOOLEAN', - 'datetime64[ns]': 'DATETIME', + "Timestamp": "DATETIME", 'object': 'TEXT', "str": "TEXT", -} + "FLOAT": float, + "INTEGER": int, + "DATETIME": str, + "TEXT": str, +} def format_sql_columns(kwargs: dict): + + # Columns if "columns" in kwargs: - if isinstance(kwargs["columns"], list): + if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) else: kwargs["columns"] = "*" - + + # ID/Conflict columns + if "id_columns" in kwargs: + if isinstance(kwargs["id_columns"], list) or isinstance(kwargs["id_columns"], pd.Index): + kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) + # Return the updated `kwargs` dictionary return kwargs @@ -184,31 +272,39 @@ def SQL(db_file: str, command: str, **kwargs): # Run the command try: with engine.connect() as connection: - # ---- SELECT - if command == "select": - return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # ---- REPLACE - elif command == "replace": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Replace current - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, 
- if_exists="replace", index=False) - - # ---- INSERT - elif command == "insert": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Insert into the table - df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", - index=False) - # ---- INSPECT - elif command == "inspect": - return inspect(engine).get_table_names() - # ---- OTHER COMMAND - else: - connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + # ---- Get the function name + command_function = SQL_COMMANDS[command]["function"] + # ---- Get the function arguments + command_args = SQL_COMMANDS[command]["args"] + # ---- Drop unnecessary keys (update `kwargs`) + kwargs = {key: value for key, value in kwargs.items() if key in command_args} + # ---- Return output + return command_function(connection, **kwargs) + # # ---- SELECT + # if command == "select": + # return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # # ---- REPLACE + # elif command == "replace": + # # ---- Extract dataframe + # df_to_add = kwargs["dataframe"] + # # ---- Replace current + # df_to_add.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="replace", index=False) + + # # ---- INSERT + # elif command == "insert": + # # ---- Extract dataframe + # df_to_add = kwargs["dataframe"] + # # ---- Insert into the table + # df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", + # index=False) + # # ---- INSPECT + # elif command == "inspect": + # return inspect(engine).get_table_names() + # # ---- OTHER COMMAND + # else: + # connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 077d9c93..bb78e1ba 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -12,8 +12,109 @@ # Create the grid points grid_points = [(i, j, 0) for i in x for j in y] +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + acoustic_files = validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files = query_acoustic_db_files(file_configuration, acoustic_files) + + # Read in the acoustic data files + # ! 
[REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + + # Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + + # Return output + return prc_nasc_df_processed + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return 
merged_output + + -# data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") db_directory = data_root_dir / "database" # ---- Create the directory if it does not already exist @@ -118,7 +219,7 @@ def create_table_sql(table_name, columns, primary_keys=None, index_columns=None) for row in rows: print(row) - +converted_data[0] check_table_exists(engine, "files_read") zarr_files_str = ["A", "B", "C", "D"] @@ -247,11 +348,13 @@ def update_specific_rows(engine, table_name, updates, conditions): condition_columns = ['x', 'y'] # Define the updates and conditions -dd = {"x": np.array([1, 2, 3 , 4, 5]), "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} +dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} new_data = pd.DataFrame(dd) new_data df = new_data +kwargs = {"table_name": "grid", "columns": df.columns, "df": df} + with engine.connect() as connection: # sql_create(connection, table_name = "grid", df = df) # sql_validate(connection, "grid") @@ -1911,6 +2014,8 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # Return the resulting DataFrame return df_validated +boundary_dict = griddify_definitions["bounds"] + ## grid_settings["grid_resolution"]["x"] = 50 grid_settings["grid_resolution"]["y"] = 50 @@ -1928,7 +2033,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), crs = projection ) - +from echopop.spatial.projection import utm_string_generator utm_string_generator(-117.0, 33.75) bound_gdf.total_bounds # Convert to UTM diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 44a83ab4..c01445b3 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -13,49 +13,9 @@ import re import contextlib from sqlalchemy import create_engine, text, Engine, inspect +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live import live_data_processing as eldp -#################################################################################################### -# * Functionality for a) loading YAML configuration file, b) search defined directory for -# * input files, c) ingest *.zarr/*.csv -# TODO: Incorporate complete YAML file validator -# TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - - # Validate file existence - # ---- str-to-Path conversion, if necessary - live_init_config_path = Path(live_init_config_path) - live_file_config_path = Path(live_file_config_path) - # ---- Create list of both config paths - config_files = [live_init_config_path, live_file_config_path] - # ---- List of file existence checks - config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] - # ---- Error evaluation and print message (if applicable) - if not all(config_existence): - missing_config = [ - files for files, exists in zip(config_files, config_existence) if not exists - ] - raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") - - # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class - # ---- Initialization settings - init_config = yaml.safe_load(Path(live_init_config_path).read_text()) - # ---- Filepath/directory settings - file_config = 
yaml.safe_load(Path(live_file_config_path).read_text()) - - # Check for intersecting/duplicative configuration keys - # ---- Compare sets of keys from each dictionary - config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) - # ---- Raise error if needed - if config_intersect: - raise ValueError( - f"The initialization and file configuration files comprise the following intersecting " - f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " - f"file." - ) - - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object - return {**init_config, **file_config} #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths @@ -66,460 +26,394 @@ def live_configuration(live_init_config_path: Union[str, Path], file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling + + + + + +#################################################################################################### +# * Functionality for reading in processed acoustic data +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change +file_settings = file_configuration["input_directories"]["acoustics"] +root_directory = file_configuration["data_root_dir"] + + +#################################################################################################### +# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# NOTE: +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +acoustic_data = load_acoustic_data(file_configuration) +acoustic_data +file_configuration["database"] + +def estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) else: - return frequency_values + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- 
Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) + + +acoustic_data_df = acoustic_data["prc_nasc_df"] + + + +# SQL(database_file, "drop", table_name="nasc_df") +# SQL(database_file, "validate", **kwargs) +# SQL(database_file, "create", table_name="nasc_df", primary_keys=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "validate", **kwargs) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", dataframe=nasc_data_df) +# SQL(database_file, "drop", table_name="nasc_df") +# SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] + +def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation": float, "occupied_area": float}) + ) + + # Get the acoustics database file + acoustics_db = file_configuration["database"]["acoustics"] + + # Insert the new data into the database and pull in the combined previous and new data combined + full_nasc_df = sql_data_exchange(acoustics_db, dataframe=nasc_data_df, + table_name="nasc_df", + 
id_columns=["longitude", "latitude", "ping_time"], + primary_keys=["longitude", "latitude", "ping_time"], + output_type=pd.DataFrame) + + # Return the output + return full_nasc_df + #################################################################################################### -# * Define `LIVE_INPUT_FILE_CONFIG_MAP` configuration mapping (this will be in an equivalent -# * `core.py`) -# TODO: Update structure with additional information (as needed) -# TODO: Documentation -LIVE_INPUT_FILE_CONFIG_MAP = { - "acoustics": { - "xarray_coordinates": { - "distance": float, - "depth": float, +def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + + # Get all database files + database_files = file_configuration["database"] + + # Iterate through all keys + for _, db_file in database_files.items(): + # ---- Map the table names + table_names = SQL(db_file, "map") + # ---- Drop any noted exceptions + if not isinstance(table_exception, list): + table_exception = [table_exception] + # ---- Drop exception table name + if None not in table_exception: + table_names = list(set(table_names) - set(table_exception)) + _ = [SQL(db_file, "drop", table_name=table) for table in table_names] + # ---- Validate that all tables were removed + if set(table_names).intersection(set(SQL(table_names, "map"))): + raise ValueError( + f"Attempted reset of [{str(db_file)}] failed." + ) + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], }, - "xarray_variables": { - "NASC": float, - "frequency_nominal": float, - "latitude": float, - "longitude": float, - "ping_time": "datetime64[ns]", - } }, - "biology": { - "catch": { - "dtypes": { - "partition": str, - "species_code": int, - "sample_weight_kg": float, - "catch_perc": float, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] }, - "names": { - "partition": "trawl_partition", - "species_code": "species_id", - "sample_weight_kg": "haul_weight", - "catch_perc": "catch_percentage", - } - }, - "length": { - "dtypes": { - "sex": str, - "rounded_length": int, - "frequency": int, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] }, - "names": { - "sex": "sex", - "rounded_length": "length", - "frequency": "length_count", - } + "pairs": [("longitude", "latitude"), ("northings", "eastings")], }, - "specimen": { - "dtypes": { - "rounded_length": int, - "organism_weight": float, - "sex": str, + "grid_resolution": { + "x_distance": { + "types": float, }, - "names": { - "sex": "sex", - "rounded_length": "length", - "organism_weight": "weight" + "y_distance": { + "types": float, }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], }, }, -} - -LIVE_FILE_FORMAT_MAP = { - "DATE:YYYYMM": { - "name": "date", - "dtype": "datetime[ns]", - "expression": r"(?P\d{6})", - }, - "DATE:YYYYMMDD": { - "name": "date", - "dtype": "datetime[ns]", - "expression": r"(?P\d{8})", - }, - "HAUL": { - "name": "haul_num", - "dtype": int, - "expression": r"(?P\d+)", - }, - "SPECIES_CODE": { - "name": "species_id", - "dtype": int, - "expression": r"(?P\d+)" + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, }, - "FILE_ID": { - "name": "file_id", - "dtype": 
str, - "expression": r"(?P.+)" + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, }, } -def compile_filename_format(file_name_format: str): - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - # Compile the regex pattern and return the output - return re.compile(regex_pattern) +reset_db_files(file_configuration, table_exception = "files_read") +reset_db_files(file_configuration) -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) +stamp = 20240714194248 +stamp.astype(int) +int(stamp) +import re +from datetime import datetime - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" - ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) - # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) - - # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - # ---- Create sub-list of columns that can be added to the DataFrame - valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - - # Compile the filename regular expression - compiled_regex = compile_filename_format(pattern) - # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) - - # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - # Return the resulting DataFrame - return df_validated -#################################################################################################### -# * Functionality for reading in processed acoustic data -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
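# A minimal, standalone sketch of the filename-tag substitution performed by
# `compile_filename_format` above: tags such as {DATE:YYYYMMDD} and {HAUL} are swapped for
# named regex groups so that metadata can be parsed straight from a file name. The format
# map, pattern, and file name below are hypothetical illustrations, not survey files.
import re

EXAMPLE_FORMAT_MAP = {
    "DATE:YYYYMMDD": r"(?P<date>\d{8})",
    "HAUL": r"(?P<haul_num>\d+)",
}

def example_template_to_regex(template: str) -> re.Pattern:
    # Swap each {TAG} for its named-group expression, then compile the result
    for tag, expression in EXAMPLE_FORMAT_MAP.items():
        template = template.replace(f"{{{tag}}}", expression)
    return re.compile(template)

example_pattern = example_template_to_regex(r"{DATE:YYYYMMDD}_{HAUL}_lf")
example_match = example_pattern.search("20190721_017_lf.csv")
if example_match:
    # Named groups expose the parsed metadata: '20190721' and haul number 17
    print(example_match.group("date"), int(example_match.group("haul_num")))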
-# TODO: Documentation -def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: - # Get acoustic directory and initialization settings - # ---- Files - acoustic_file_settings = file_configuration["input_directories"]["acoustic"] - # ---- General settings - acoustic_analysis_settings = file_configuration["acoustics"] +def infer_datetime_format(timestamp_str: Union[int, str]): + patterns = { + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS + r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + } - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ---- Initialize the dictionary that will define this key in the `input` attribute - acoustics_output = {"prc_nasc_df": pd.DataFrame(), - "nasc_df": pd.DataFrame()} - # ---- Initialize the SQL dictionary - # sql_acoustics_output = {"sv_df": pd.DataFrame()} - - # Create full filepath - acoustic_directory_path = ( - Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] - ) + for pattern, date_format in patterns.items(): + if re.match(pattern, timestamp_str): + return date_format - # Validate filepath, columns, datatypes - # ---- Directory check - directory_existence = acoustic_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{acoustic_directory_path}] does not exist." - ) - # ---- Get the defined file extension - file_extension = acoustic_file_settings["extension"] - # ---- Create Path.glob generator object (the case of a *.zarr file) - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" 
- ) + raise ValueError("Unknown timestamp format") + +filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + + + +df[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] +df[(df["species_id"] == 17 if "species_id" in df.columns)] + +(df[df["haul_num"] == 17 if "haul_num" in df.columns] else True) + + +from datetime import datetime + +df = biology_output["trawl_info_df"] +df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] +df.index + +biology_output["trawl_info_df"].reset_index().index +df = biology_output["catch_df"] +df = df.loc[0, :].to_frame().T +df.index +df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] + +def convert_datetime(timestamp: Union[int, str, pd.Series]): + + if isinstance(timestamp, pd.Series): + test_timestamp = str(timestamp[0]) else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "acoustics.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - zarr_files_str = [str(file) for file in zarr_files] - # ---- Create DataFrame - current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `zarr_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in zarr_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Find new files that have not yet been processed - if not new_files: - subset_files = zarr_files + test_timestamp = str(timestamp) + + # Approximate the datetime format + datetime_format = infer_datetime_format(str(test_timestamp)) + + # + if isinstance(timestamp, pd.Series): + return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) else: - subset_files = set(zarr_files).intersection(set(new_files)) - - # Read in the `*.zarr` file(s) - # ! 
[REQUIRES DASK] ---- Read in the listed file - if len(subset_files) > 1: - zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - elif len(subset_files) == 1: - zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") - - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - - # Extract defined acoustic frequency - # ---- From the configuration - transmit_settings = acoustic_analysis_settings["transmit"] - # ---- Transform `frequency_nominal`, if necessary - zarr_data_df_filtered["frequency_nominal"] = ( - configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], - transmit_settings, - zarr_data_ds["frequency_nominal"].units) - ) - # ---- Filter out any unused frequency coordinates - zarr_data_df_output = ( - zarr_data_df_filtered - [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] - ) + return datetime.strptime(timestamp, datetime_format) - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) - # ---- Drop frequency column and return the output - acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) - # ---- Return output - if update_config: - if file_configuration["database"]["acoustics"] is None: - file_configuration["database"]["acoustics"] = db_file - return acoustics_output, file_configuration - else: - return acoustics_output -#################################################################################################### -# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data, file_configuration = load_acoustic_data(file_configuration) -acoustic_data -#################################################################################################### -def load_biology_data(file_configuration: dict, update_config: bool = True): - - # Get acoustic directory and initialization settings - # ---- Files - biology_file_settings = file_configuration["input_directories"]["biological"] - # ---- General settings - biology_analysis_settings = file_configuration["biology"] - - # Get the file-specific settings, datatypes, columns, etc. 
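# A minimal, standard-library sketch of the `files_read` bookkeeping pattern used by these
# data loaders: paths that have already been ingested are stored in a small SQLite table,
# and only paths absent from that table are processed on the next call. The table layout
# and function name here are illustrative assumptions, not the exact schema used by `SQL()`.
import sqlite3
from pathlib import Path
from typing import List

def example_new_files_only(db_path: Path, candidate_files: List[Path]) -> List[str]:
    candidates = [str(f) for f in candidate_files]
    with sqlite3.connect(db_path) as con:
        # Track previously read files in a single-column table keyed on the file path
        con.execute("CREATE TABLE IF NOT EXISTS files_read (filepath TEXT PRIMARY KEY)")
        already_read = {row[0] for row in con.execute("SELECT filepath FROM files_read")}
        unread = [f for f in candidates if f not in already_read]
        # Record the new paths so they are skipped on the next pass
        con.executemany(
            "INSERT OR IGNORE INTO files_read (filepath) VALUES (?)", [(f,) for f in unread]
        )
    return unread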
- # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = biology_file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Initialize the SQL dictionary - sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +infer_datetime_format(stamp) +convert_datetime(stamp) +infer_datetime_format(202407) + +# {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} +file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" +file_configuration["geospatial"] +spatial_config = file_configuration["geospatial"] +############### + +acoustic_data = self.input["acoustics"] +biology_data = self.input["biology"] + +def load_spatial_data(acoustic_data: dict, + biology_data: dict, + file_configuration: dict,): - # Create full filepath - biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Get (geo)spatial config + spatial_config = file_configuration["geospatial"] + # ---- Remove case sensitivity + spatial_config = {key.lower(): value for key, value in spatial_config.items()} + # ---- Extract the projection + projection = spatial_config["projection"] + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = spatial_config["link_biology_acoustics"] + + # Validate the configuration + validate_spatial_config(spatial_config) + + # Assign the spatial link constraints to the acoustic and biological data + if acoustics_biology_link == "INPFC": + apply_inpfc_definitions(acoustic_data, biology_data, spatial_config) + + + + # Convert the DataFrame to a GeoDataFrame + acoustic_data_gdf = gpd.GeoDataFrame( + data=acoustic_data, + geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), + crs=projection ) - # ---- Directory check - directory_existence = biology_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." - ) - # ---- Get the defined file extension - file_extension = biology_file_settings["extension"] - # ---- Create Path.glob generator object - file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") - #---- Create list of `*.csv`` files - csv_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
- ) - else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == 
species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output + # Validate the spatial biology-acoustics linking method + # ---- Get the biology-acoustics linking method + link_method = next(key for key, value in acoustics_biology_link.items() if value) + # ---- Flag Error if unexpected method + if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." + ) + #################################################################################################### # TEST: BIOLOGY FILE INGESTION CONFIGURATION # NOTE: From 7f49f316a7dcacca48940b076df8b75923250b96 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 25 Jul 2024 09:51:06 -0700 Subject: [PATCH 08/81] Reorganize loading functions --- echopop/live/live_core.py | 61 ++ echopop/live/live_data_loading.py | 612 +++++++++++++++++++ echopop/live/live_data_processing.py | 877 +-------------------------- echopop/live/live_spatial_methods.py | 198 ++++++ echopop/live/live_survey.py | 8 +- echopop/live/sql_methods.py | 137 +++-- echopop/zarr_read_ingest_test.py | 17 +- 7 files changed, 986 insertions(+), 924 deletions(-) create mode 100644 echopop/live/live_data_loading.py create mode 100644 echopop/live/live_spatial_methods.py diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 95750f5f..28a63237 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -131,3 +131,64 @@ "expression": r"(?P.+)" }, } + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], + }, + }, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] + }, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] + }, + "pairs": [("longitude", "latitude"), ("northings", "eastings")], + }, + "grid_resolution": { + "x_distance": { + "types": float, + }, + "y_distance": { + "types": float, + }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], + }, + }, + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, + }, + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, + }, +} \ No newline at end of file diff --git a/echopop/live/live_data_loading.py 
b/echopop/live/live_data_loading.py new file mode 100644 index 00000000..ce5a06f7 --- /dev/null +++ b/echopop/live/live_data_loading.py @@ -0,0 +1,612 @@ +from pathlib import Path +from typing import Union, Tuple, Optional, List +import yaml +import re +from .sql_methods import SQL, query_processed_files, sql_data_exchange +import pandas as pd +from datetime import datetime +import xarray as xr + +from .live_core import( + LIVE_FILE_FORMAT_MAP, + LIVE_INPUT_FILE_CONFIG_MAP, + SPATIAL_CONFIG_MAP +) + +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError( + f"The following configuration files do not exist: {missing_config}." + ) + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) + # ---- Raise error if needed + if config_intersect: + raise ValueError( + f"The initialization and file configuration files comprise the following intersecting " + f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " + f"file." + ) + + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object + return {**init_config, **file_config} + +# TODO: Documentation +def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: + + # Get acoustic directory and initialization settings + # ---- Create the full filepath + directory_path = Path(root_directory) / file_settings["directory"] + # ---- Get the defined file extension + file_extension = file_settings["extension"] + + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_extension}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_extension}` files found in [{directory_path}]!" + ) + + # Return the output + return data_files + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Get the file-specific settings, datatypes, columns, etc. 
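# A small, self-contained sketch of the merge rule enforced by `live_configuration` above:
# the initialization and file configurations must not share top-level keys, otherwise one
# file would silently overwrite the other. The dictionaries below are toy stand-ins for the
# parsed YAML contents, not values from an actual configuration.
example_init_config = {
    "acoustics": {"transmit": {"frequency": 38, "units": "kHz"}},
}
example_file_config = {
    "data_root_dir": "path/to/data",
    "input_directories": {"acoustics": {"directory": "acoustic/", "extension": "zarr"}},
}

example_shared = set(example_init_config).intersection(example_file_config)
if example_shared:
    raise ValueError(f"Duplicate configuration keys: {', '.join(sorted(example_shared))}")

# Safe to combine once the keys are known to be unique
example_config = {**example_init_config, **example_file_config}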
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + + # Determine the file loading method for the `acoustic_files` + if len(acoustic_files) > 1: + zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + else: + zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one file: in " + f"{', '.join(missing_columns)}!" + ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Gather some of the units + data_units = { + "longitude": zarr_data_ds.longitude.units, + "latitude": zarr_data_ds.latitude.units, + "frequency": zarr_data_ds.frequency_nominal.units, + } + + # Return a Tuple + return zarr_data_df_filtered, data_units + +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == "kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values + +# TODO: Documentation +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + +# TODO: Documentation +def load_acoustic_data(file_configuration: dict) -> 
Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + acoustic_files = validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files, file_configuration["database"]["acoustics"] = ( + query_processed_files(root_directory, file_settings, acoustic_files) + ) + + # Read in the acoustic data files + if new_acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + # ---- Return output + return prc_nasc_df_processed + else: + return None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] + + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_map["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
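# [Editor's sketch -- annotation, not a patch line] How a `file_name_formats` template becomes a
# regex via `compile_filename_format` above. `LIVE_FILE_FORMAT_MAP` is not shown in this diff, so
# the map below is a hypothetical stand-in with the same {"expression", "name", "dtype"} layout,
# and the template/filename are made up.
import re

HYPOTHETICAL_FORMAT_MAP = {
    "DATE": {"expression": r"(?P<DATE>\d{8})", "name": "date", "dtype": str},
    "HAUL": {"expression": r"(?P<HAUL>\d+)", "name": "haul_num", "dtype": int},
}

regex_pattern = "{DATE}_{HAUL}_{FILE_ID:catch}"
for key, value in HYPOTHETICAL_FORMAT_MAP.items():
    regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P<FILE_ID>\1)", regex_pattern)

match_obj = re.compile(regex_pattern).search("20190731_017_catch.csv")
haul_num = HYPOTHETICAL_FORMAT_MAP["HAUL"]["dtype"](match_obj.group("HAUL"))  # -> 17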
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_map["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_map["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + "latitude"]) + ) + + # Return a list of the output + return list(key_columns) + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + +def preprocess_biology_data(biology_output: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) + + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = "row" + 
df.index.astype(str) + "-" + "-".join(key_columns) + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return results_dict + +def infer_datetime_format(timestamp_str: Union[int, str]): + patterns = { + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS + r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + } + + for pattern, date_format in patterns.items(): + if re.match(pattern, timestamp_str): + return date_format + + raise ValueError("Unknown timestamp format") + +def convert_datetime(timestamp: Union[int, str, pd.Series]): + + if isinstance(timestamp, pd.Series): + test_timestamp = str(timestamp[0]) + else: + test_timestamp = str(timestamp) + + # Approximate the datetime format + datetime_format = infer_datetime_format(str(test_timestamp)) + + # + if isinstance(timestamp, pd.Series): + return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) + else: + return datetime.strptime(timestamp, datetime_format) + +def load_biology_data(file_configuration: dict): + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["biology"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + biology_files = validate_data_directory(root_directory, file_settings) + + # Query `biology.db` to process only new files (or create the db file in the first place) + # SQL(biology_db, "drop", table_name="files_read") + new_biology_files, file_configuration["database"]["biology"] = ( + query_processed_files(root_directory, file_settings, biology_files) + ) + + # Get the file-specific settings, datatypes, columns, etc. 
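# [Editor's sketch -- annotation, not a patch line] The timestamp handling that
# `infer_datetime_format`/`convert_datetime` above provide, reduced to stdlib calls with a
# made-up timestamp.
import re
from datetime import datetime

patterns = {
    r"^\d{14}$": "%Y%m%d%H%M%S",                                    # YYYYMMDDHHMMSS
    r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S",  # YYYY-MM-DD HH:MM:SS
}
stamp = "20240715084428"
fmt = next(f for p, f in patterns.items() if re.match(p, stamp))  # "%Y%m%d%H%M%S"
parsed = datetime.strptime(stamp, fmt)  # datetime(2024, 7, 15, 8, 44, 28)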
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + ) + + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + file_settings["file_name_formats"][dataset], + new_biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Pre-process and return the results + return preprocess_biology_data(biology_output, file_configuration) + +def validate_hauls_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for `proximity` + if "proximity" not in input_method_settings.keys(): + raise KeyError( + "The following parameters are missing from the biology-acoustic linking method: " + "'proximity'!" + ) + + # Evaluate valid options for `proximity` + if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: + raise KeyError( + f"Value biology-acoustic linking method parameter `proximity` must be one of the : " + f"following: {link_method_settings['proximity']['choices']}." + ) + +def validate_griddify_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" 
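# [Editor's sketch -- annotation, not a patch line] The shape of spatial settings accepted by
# `validate_hauls_config` above (and the dispatching `validate_spatial_config` further below);
# the values are illustrative.
spatial_config = {
    "link_biology_acoustics": "closest_haul",
    "closest_haul": {"proximity": "distance"},  # must be "distance" or "time"
}
# validate_spatial_config(spatial_config) routes to validate_hauls_config() and returns without
# raising because "distance" is a permitted `proximity` choice.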
+ ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key] + # ---- Compare entries + parameter_diff = set(input.keys()).difference(set(model.keys())) + # ---- Raise Error + if parameter_diff: + raise KeyError( + f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " + f"configuration." + ) + # ---- Check if the appropriate coordinate pairs are present + coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] + # ---- Count the number of paired coordinates + pair_counts = [len(param) for param in coordinate_pairs] + # ---- If there are multiple pairs + if (np.array(pair_counts) == 2).sum() != 1: + raise ValueError( + f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " + f"for the link method '{link_method}' defined via the following options: " + f"{model['pairs']}." + ) + # ---- Check the datatypes + for parameter in input.keys(): + # ---- Get the datatypes + config_dtypes = model[parameter]["types"] + # ---- Get input parameter + input_parameter = input[parameter] + # ---- If List + if isinstance(config_dtypes, list): + if not isinstance(input_parameter, list): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be contained within a list." + ) + else: + input_parameter = [input_parameter] + config_dtypes = [config_dtypes] + # ---- Check correct datatypes + if not np.all([type(value) in config_dtypes for value in input_parameter]): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be one of the following types within a list: " + f"{config_dtypes}." + ) + +def validate_inpfc_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key]["types"] + # ---- Evaluate if a list + if not isinstance(input, list): + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be contained within a list." + ) + # ---- Evaluate if it is a type within the list + if not type(input[0]) in model: + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be one of the following types within a list: {model}." 
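# [Editor's sketch -- annotation, not a patch line] A `griddify` block that passes
# `validate_griddify_config` above: each parameter supplies exactly one coordinate pair, and
# list-typed entries are wrapped in lists. All numbers are illustrative.
griddify_config = {
    "link_biology_acoustics": "griddify",
    "projection": "epsg:4326",  # read by `apply_griddify_definitions` elsewhere in this patch
    "griddify": {
        "bounds": {
            "longitude": [-135.0, -120.0],
            "latitude": [34.0, 55.0],
        },
        "grid_resolution": {
            "x_distance": 25.0,  # nautical miles
            "y_distance": 25.0,
        },
    },
}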
+ ) + + +def validate_spatial_config(spatial_config: dict): + + # Check the link method + # ---- Extract string-formatted method name + link_method = spatial_config["link_biology_acoustics"].lower() + # ---- Validate + if link_method not in SPATIAL_CONFIG_MAP.keys(): + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." + ) + + # Verify that associated parameters are present in the configuration settings + # ---- Get keys as a list + config_keys = list(spatial_config.keys()) + # ---- Check for specific methods + if link_method not in config_keys and link_method != "global": + raise ValueError( + f"No parameters provided for the biology-acoustic linking ([{link_method}])." + ) + + # Check key settings + if link_method == "griddify": + validate_griddify_config(spatial_config, link_method) + elif link_method == "inpfc": + validate_inpfc_config(spatial_config, link_method) + elif link_method != "global": + validate_hauls_config(spatial_config, link_method) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index fd89993c..cf126230 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -5,885 +5,10 @@ from typing import Union, Tuple, Optional, List import pandas as pd -import xarray as xr + import numpy as np from .live_core import( - LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP ) - -from .sql_methods import SQL - -# TODO: Incorporate complete YAML file validator -# TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - - # Validate file existence - # ---- str-to-Path conversion, if necessary - live_init_config_path = Path(live_init_config_path) - live_file_config_path = Path(live_file_config_path) - # ---- Create list of both config paths - config_files = [live_init_config_path, live_file_config_path] - # ---- List of file existence checks - config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] - # ---- Error evaluation and print message (if applicable) - if not all(config_existence): - missing_config = [ - files for files, exists in zip(config_files, config_existence) if not exists - ] - raise FileNotFoundError( - f"The following configuration files do not exist: {missing_config}." - ) - - # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class - # ---- Initialization settings - init_config = yaml.safe_load(Path(live_init_config_path).read_text()) - # ---- Filepath/directory settings - file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - - # Check for intersecting/duplicative configuration keys - # ---- Compare sets of keys from each dictionary - config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) - # ---- Raise error if needed - if config_intersect: - raise ValueError( - f"The initialization and file configuration files comprise the following intersecting " - f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " - f"file." 
- ) - - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object - return {**init_config, **file_config} - -def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: - - # Get acoustic directory and initialization settings - # ---- Create the full filepath - directory_path = Path(root_directory) / file_settings["directory"] - # ---- Get the defined file extension - file_extension = file_settings["extension"] - - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_extension}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_extension}` files found in [{directory_path}]!" - ) - - # Return the output - return data_files - -def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: - - # Get the database name - db_name = file_settings["database_name"] - - # Create filepath to the SQL database - # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to the database file - db_file = db_directory / db_name - - # Create a list of string-formatted Path names - files_str = [str(file) for file in files] - # ---- Create DataFrame - current_files = pd.DataFrame(files_str, columns=["filepath"]) - - # Check for the table `files_read` - files_read_tbl = SQL(db_file, "validate", table_name="files_read") - - # Validate whether the table exists; if not, create the table and then insert - if not files_read_tbl: - # ---- Create table - SQL(db_file, "create", table_name="files_read", dataframe=current_files, - primary_keys = ["filepath"]) - # ---- Populate table - SQL(db_file, "insert", table_name="files_read", dataframe=current_files) - # ---- Break early - return files_str, db_file - - # Query already existing files - previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) - # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") - - # Filter out previously processed files - # ---- Apply filter by comparing sets and return the output - return list(set(files_str) - set(previous_files)), db_file - -def sql_data_exchange(database_file: Path, **kwargs): - - # Check whether the `table_name` table exists - table_exists = SQL(database_file, "validate", **kwargs) - - # If empty and table does not exist - if kwargs["dataframe"].empty and table_exists: - return SQL(database_file, "select", **kwargs) - - # Create table if it does not exist and run the initial insertion - if not table_exists: - # ---- Create table - SQL(database_file, "create", **kwargs) - # ---- Ignore the `id_columns` argument, if present - try: - del kwargs["id_columns"] - except KeyError: - pass - # ---- Insert into table - SQL(database_file, "insert", **kwargs) - # ---- Return the initial dataframe - return kwargs.get("dataframe") - - # Insert into the table - SQL(database_file, "insert", **kwargs) - - # Select existing data frame the database and return the output - return SQL(database_file, "select", **kwargs) - -def read_acoustic_zarr(acoustic_files: 
Path) -> tuple: - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - - # Determine the file loading method for the `acoustic_files` - if len(acoustic_files) > 1: - zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - else: - zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") - - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one file: in " - f"{', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - - # Gather some of the units - data_units = { - "longitude": zarr_data_ds.longitude.units, - "latitude": zarr_data_ds.latitude.units, - "frequency": zarr_data_ds.frequency_nominal.units, - } - - # Return a Tuple - return zarr_data_df_filtered, data_units - -# TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change - else: - return frequency_values - -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, - file_configuration: dict) -> pd.DataFrame: - - # Get acoustic processing settings - acoustic_analysis_settings = file_configuration["acoustics"] - # ---- Extract the fined acoustic frequency - transmit_settings = acoustic_analysis_settings["transmit"] - - # Filter the dataset - # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) - ) - # ---- Filter out any unused frequency coordinates - prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) - -def 
load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files, file_configuration["database"]["acoustics"] = ( - query_processed_files(root_directory, file_settings, acoustic_files) - ) - - # Read in the acoustic data files - if new_acoustic_files: - # ! [REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - # ---- Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - # ---- Return output - return prc_nasc_df_processed - else: - return None - -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): - - # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subfile_str = [str(file) for file in list(subfile_path_obj)] - - # Convert list of proposed files from Path to String - file_str = [str(file) for file in list(files)] - - # Find intersection with the proposed filenames and return the output - return list(set(subfile_str).intersection(set(file_str))) - -def compile_filename_format(file_name_format: str): - - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - - # Compile the regex pattern and return the output - return re.compile(regex_pattern) - -def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) - - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_map["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
- ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_map["dtypes"]) - # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_map["names"]) - - # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - # ---- Create sub-list of columns that can be added to the DataFrame - valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - - # Compile the filename regular expression - compiled_regex = compile_filename_format(pattern) - # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) - - # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - # Return the resulting DataFrame - return df_validated - -def preprocess_biology_data(biology_output: dict, file_configuration: dict): - - # Get SQL database file - biology_db = file_configuration["database"]["biology"] - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = file_configuration["biology"]["catch"]["partition"] - # ---- Create filter dictionary - filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - - # Apply the filter - filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # ---- Swap this out if no new files are present - if not filtered_biology_output: - # ---- Get available tables - table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) - # ---- Plug into the dictionary - filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary - results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Get identifier columns - key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) - # ---- Create copy - df = df.copy() - # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint - df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) - # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - results_dict.update({table_name: table_df.drop(columns="id")}) - - # Return the output - return results_dict - -def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: - - # Get the data input column names - if data_dict[table_name].empty: - # ---- Inspect the table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of the data columns - table_columns = list(inspected_table.keys()) - else: - # ---- Get the DataFrame column names - table_columns = data_dict[table_name].columns - - # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - 
"latitude"]) - ) - - # Return a list of the output - return list(key_columns) - -def load_biology_data(file_configuration: dict): - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["biology"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - biology_files = validate_data_directory(root_directory, file_settings) - - # Query `biology.db` to process only new files (or create the db file in the first place) - # SQL(biology_db, "drop", table_name="files_read") - new_biology_files, file_configuration["database"]["biology"] = ( - query_processed_files(root_directory, file_settings, biology_files) - ) - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - - # Add SQL file to dict - file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] - ) - - # Iterate through the different biology datasets and read them in - for dataset in list(biology_file_ids.keys()): - # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - file_settings["file_name_formats"][dataset], - new_biology_files, - file_settings["extension"]) - # ---- If there are dataset files available - if dataset_files: - # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), - file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) - for file in dataset_files] - # ---- Concatenate the dataset - dataframe_combined = pd.concat(dataframe_list, ignore_index=True) - # ---- Lower-case sex - if "sex" in dataframe_combined.columns: - dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() - # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() - # ---- Reformat datetime column - if "datetime" in dataframe_combined.columns: - dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) - # ---- Add to the data dictionary - biology_output[f"{dataset}_df"] = dataframe_combined - - # Pre-process and return the results - return preprocess_biology_data(biology_output, file_configuration) - -SPATIAL_CONFIG_MAP = { - "closest_haul": { - "proximity": { - "choices": ["distance", "time"], - }, - }, - "global" : {}, - "griddify": { - "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, - "pairs": [("longitude", "latitude"), ("northings", "eastings")], - }, - "grid_resolution": { - "x_distance": { - "types": float, - }, - "y_distance": { - "types": float, - }, - "d_longitude": { - "types": float, - }, - "d_latitude": { - "types": 
float, - }, - "grid_size_x": { - "types": int, - }, - "grid_size_y": { - "types": int, - }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], - }, - }, - "inpfc": { - "stratum_names": { - "types": [int, str] - }, - "latitude_max": { - "types": [float], - }, - }, - "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, - }, -} - -def validate_spatial_config(spatial_config: dict): - - # Check the link method - # ---- Extract string-formatted method name - link_method = spatial_config["link_biology_acoustics"].lower() - # ---- Validate - if link_method not in SPATIAL_CONFIG_MAP.keys(): - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." - ) - - # Verify that associated parameters are present in the configuration settings - # ---- Get keys as a list - config_keys = list(spatial_config.keys()) - # ---- Check for specific methods - if link_method not in config_keys and link_method != "global": - raise ValueError( - f"No parameters provided for the biology-acoustic linking ([{link_method}])." - ) - - # Check key settings - if link_method == "griddify": - validate_griddify_config(spatial_config, link_method) - elif link_method == "inpfc": - validate_inpfc_config(spatial_config, link_method) - elif link_method != "global": - validate_hauls_config(spatial_config, link_method) - -def validate_hauls_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for `proximity` - if "proximity" not in input_method_settings.keys(): - raise KeyError( - "The following parameters are missing from the biology-acoustic linking method: " - "'proximity'!" - ) - - # Evaluate valid options for `proximity` - if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: - raise KeyError( - f"Value biology-acoustic linking method parameter `proximity` must be one of the : " - f"following: {link_method_settings["proximity"]["choices"]}." - ) - -def validate_griddify_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for the required keys - key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) - # ---- Raise Error - if key_diff: - raise KeyError( - f"The following parameters are missing from the biology-acoustic linking method: " - f"{list(key_diff)}!" - ) - - # Iterate through the keys to evaluate inputs - for key in list(input_method_settings.keys()): - # ---- Subset the input method config - input = input_method_settings[key] - # ---- Get the original config of the dtypes - model = link_method_settings[key] - # ---- Compare entries - parameter_diff = set(input.keys()).difference(set(model.keys())) - # ---- Raise Error - if parameter_diff: - raise KeyError( - f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " - f"configuration." 
- ) - # ---- Check if the appropriate coordinate pairs are present - coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] - # ---- Count the number of paired coordinates - pair_counts = [len(param) for param in coordinate_pairs] - # ---- If there are multiple pairs - if (np.array(pair_counts) == 2).sum() != 1: - raise ValueError( - f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " - f"for the link method '{link_method}' defined via the following options: " - f"{model["pairs"]}." - ) - # ---- Check the datatypes - for parameter in input.keys(): - # ---- Get the datatypes - config_dtypes = model[parameter]["types"] - # ---- Get input parameter - input_parameter = input[parameter] - # ---- If List - if isinstance(config_dtypes, list): - if not isinstance(input_parameter, list): - raise TypeError( - f"Biology-acoustic linking method argument '{parameter}' within '{key}' " - f"for method '{link_method}' must be contained within a list." - ) - else: - input_parameter = [input_parameter] - config_dtypes = [config_dtypes] - # ---- Check correct datatypes - if not np.all([type(value) in config_dtypes for value in input_parameter]): - raise TypeError( - f"Biology-acoustic linking method argument '{parameter}' within '{key}' " - f"for method '{link_method}' must be one of the following types within a list: " - f"{config_dtypes}." - ) - -def validate_inpfc_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for the required keys - key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) - # ---- Raise Error - if key_diff: - raise KeyError( - f"The following parameters are missing from the biology-acoustic linking method: " - f"{list(key_diff)}!" - ) - - # Iterate through the keys to evaluate inputs - for key in list(input_method_settings.keys()): - # ---- Subset the input method config - input = input_method_settings[key] - # ---- Get the original config of the dtypes - model = link_method_settings[key]["types"] - # ---- Evaluate if a list - if not isinstance(input, list): - raise TypeError( - f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " - f"be contained within a list." - ) - # ---- Evaluate if it is a type within the list - if not type(input[0]) in model: - raise TypeError( - f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " - f"be one of the following types within a list: {model}." 
- ) - -def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - - # Extract the INPFC definitions - inpfc_definitions = spatial_config["inpfc"] - - # Create latitude bins - latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) - # ---- Append 1 more stratum layer - bin_names = np.concatenate([inpfc_definitions["stratum_names"], - [np.max(inpfc_definitions["stratum_names"]) + 1]]) - - # Create spatial key - spatial_config["spatial_key"] = pd.DataFrame({ - "latitude_limit": inpfc_definitions["latitude_max"], - }) - # ---- Cut - spatial_config["spatial_key"]["stratum"] = ( - pd.cut(inpfc_definitions["latitude_max"], - latitude_bins, - right = True, - labels = bin_names) - ) - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - # ---- Bin the latitude data - acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( - acoustic_data["prc_nasc_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - - # Get the `trawl_info_df` values, if they exist, and apply stratification information - if not biology_data["trawl_info_df"].empty: - # ---- Bin the latitude data - biology_data["trawl_info_df"]["stratum"] = pd.cut( - biology_data["trawl_info_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - -def define_boundary_box(boundary_dict: dict, projection: str): - - # Get x-coordinates - if "longitude" in boundary_dict.keys(): - x = np.array(boundary_dict["longitude"]) - else: - x = np.array(boundary_dict["northings"]) - - # Get y-coordinates - if "latitude" in boundary_dict.keys(): - y = np.array(boundary_dict["latitude"]) - else: - y = np.array(boundary_dict["eastings"]) - - # Create a boundary DataFrame - bound_df = pd.DataFrame({ - "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), - "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), - }) - - # Convert to a GeoDataFrame and return the GeoDataFrame - return gpd.GeoDataFrame( - data=bound_df, - geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), - crs=projection, - ) - - -def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - - # Extract the griddification definitions - griddify_definitions = spatial_config["griddify"] - - # Get the projection definition - projection = spatial_config["projection"] - - # Compute the boundary box GeoDataFrame - boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) - - # Convert the coordinates, if needed - if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): - # ---- Compute the equivalent UTM string - utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), - np.median(boundary_box.loc[0:3, "y"]))) - # ---- Compute the boundary box GeoDataFrame with the new projection - boundary_box = boundary_box.to_crs(utm_num) - # ---- Create a new projection for later - projection_new = f"epsg:{utm_num}" - else: - projection_new = projection - - # Define the step sizes - # ---- Define x step size - x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters - # ---- Define y step size - y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters - - # Get the boundary tuple - xmin, ymin, xmax, ymax = boundary_box.total_bounds - - # Generate the cells - grid_cells = [] - # ---- Iterate through - for y0 in np.arange(ymin, ymax+y_step, 
y_step): - for x0 in np.arange(xmin, xmax+x_step, x_step): - x1 = x0-x_step - y1 = y0+y_step - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - - # Convert to a GeoDataFrame - cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) - - # Get the centroids - cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - - # - prc_nasc_df = acoustic_data["prc_nasc_df"] - - # to GDF - prc_nasc_gdf = gpd.GeoDataFrame( - data=prc_nasc_df, - geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), - crs=projection, - ) - # to UTM - prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) - - prc_nasc_new["x"] = prc_nasc_new["geometry"].x - prc_nasc_new["y"] = prc_nasc_new["geometry"].y - - # ---- Bin the latitude data - prc_nasc_new["stratum_x"] = pd.cut( - prc_nasc_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - prc_nasc_new["stratum_y"] = pd.cut( - prc_nasc_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - acoustic_data["prc_nasc_df"]["stratum"] = ( - prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) - ) - - if not biology_data["trawl_info_df"].empty: - - # - trawl_info_df = biology_data["trawl_info_df"] - - # to GDF - trawl_info_gdf = gpd.GeoDataFrame( - data=trawl_info_df, - geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), - crs=projection, - ) - # to UTM - trawl_info_new = trawl_info_gdf.to_crs(projection_new) - - trawl_info_new["x"] = trawl_info_new["geometry"].x - trawl_info_new["y"] = trawl_info_new["geometry"].y - - # ---- Bin the latitude data - trawl_info_new["stratum_x"] = pd.cut( - trawl_info_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - trawl_info_new["stratum_y"] = pd.cut( - trawl_info_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - biology_data["trawl_info_df"]["stratum"] = ( - trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) - ) - diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py new file mode 100644 index 00000000..f38b130b --- /dev/null +++ b/echopop/live/live_spatial_methods.py @@ -0,0 +1,198 @@ +import geopandas as gpd +import pandas as pd +import numpy as np +from geopy.distance import distance +from ..spatial.projection import utm_string_generator +import shapely.geometry + +def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the INPFC definitions + inpfc_definitions = spatial_config["inpfc"] + + # Create latitude bins + latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) + # ---- Append 1 more stratum layer + bin_names = np.concatenate([inpfc_definitions["stratum_names"], + [np.max(inpfc_definitions["stratum_names"]) + 1]]) + + # Create spatial key + spatial_config["spatial_key"] = pd.DataFrame({ + "latitude_limit": inpfc_definitions["latitude_max"], + }) + # ---- Cut + spatial_config["spatial_key"]["stratum"] = ( + 
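# [Editor's sketch -- annotation, not a patch line] The latitude binning performed by
# `apply_inpfc_definitions`, reduced to a standalone pandas call; the stratum limits and
# latitudes are made-up examples.
import numpy as np
import pandas as pd

latitude_max = [42.0, 46.0, 50.0]
stratum_names = [1, 2, 3]
latitude_bins = np.concatenate([[-90.0], latitude_max, [90.0]])
bin_names = np.concatenate([stratum_names, [np.max(stratum_names) + 1]])  # extra northern stratum

latitudes = pd.Series([40.1, 45.2, 51.7])
strata = pd.cut(latitudes, latitude_bins, right=True, labels=bin_names)
print(strata.tolist())  # [1, 2, 4]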
pd.cut(inpfc_definitions["latitude_max"], + latitude_bins, + right = True, + labels = bin_names) + ) + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + # ---- Bin the latitude data + acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( + acoustic_data["prc_nasc_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, + ) + + # Get the `trawl_info_df` values, if they exist, and apply stratification information + if not biology_data["trawl_info_df"].empty: + # ---- Bin the latitude data + biology_data["trawl_info_df"]["stratum"] = pd.cut( + biology_data["trawl_info_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, + ) + +def define_boundary_box(boundary_dict: dict, projection: str): + + # Get x-coordinates + if "longitude" in boundary_dict.keys(): + x = np.array(boundary_dict["longitude"]) + else: + x = np.array(boundary_dict["northings"]) + + # Get y-coordinates + if "latitude" in boundary_dict.keys(): + y = np.array(boundary_dict["latitude"]) + else: + y = np.array(boundary_dict["eastings"]) + + # Create a boundary DataFrame + bound_df = pd.DataFrame({ + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + }) + + # Convert to a GeoDataFrame and return the GeoDataFrame + return gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), + crs=projection, + ) + +def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the griddification definitions + griddify_definitions = spatial_config["griddify"] + + # Get the projection definition + projection = spatial_config["projection"] + + # Compute the boundary box GeoDataFrame + boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + + # Convert the coordinates, if needed + if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): + # ---- Compute the equivalent UTM string + utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), + np.median(boundary_box.loc[0:3, "y"]))) + # ---- Compute the boundary box GeoDataFrame with the new projection + boundary_box = boundary_box.to_crs(utm_num) + # ---- Create a new projection for later + projection_new = f"epsg:{utm_num}" + else: + projection_new = projection + + # Define the step sizes + # ---- Define x step size + x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters + # ---- Define y step size + y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + + # Get the boundary tuple + xmin, ymin, xmax, ymax = boundary_box.total_bounds + + # Generate the cells + grid_cells = [] + # ---- Iterate through + for y0 in np.arange(ymin, ymax+y_step, y_step): + for x0 in np.arange(xmin, xmax+x_step, x_step): + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + + # Get the centroids + cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + + # + prc_nasc_df = acoustic_data["prc_nasc_df"] + + # to GDF + prc_nasc_gdf = gpd.GeoDataFrame( + data=prc_nasc_df, + 
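# [Editor's sketch -- annotation, not a patch line] The boundary DataFrame that
# `define_boundary_box` above assembles for a longitude/latitude bounds entry, with illustrative
# coordinates; downstream only `total_bounds` and the x/y medians are consumed.
import geopandas as gpd
import pandas as pd

bound_df = pd.DataFrame({
    "x": [-135.0, -120.0, -120.0, -135.0, -135.0],
    "y": [34.0, 55.0, 55.0, 34.0, 34.0],
})
boundary_box = gpd.GeoDataFrame(
    data=bound_df,
    geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]),
    crs="epsg:4326",
)
print(boundary_box.total_bounds)  # [-135.   34. -120.   55.]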
geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), + crs=projection, + ) + # to UTM + prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + + prc_nasc_new["x"] = prc_nasc_new["geometry"].x + prc_nasc_new["y"] = prc_nasc_new["geometry"].y + + # ---- Bin the latitude data + prc_nasc_new["stratum_x"] = pd.cut( + prc_nasc_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + prc_nasc_new["stratum_y"] = pd.cut( + prc_nasc_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + acoustic_data["prc_nasc_df"]["stratum"] = ( + prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) + ) + + if not biology_data["trawl_info_df"].empty: + + # + trawl_info_df = biology_data["trawl_info_df"] + + # to GDF + trawl_info_gdf = gpd.GeoDataFrame( + data=trawl_info_df, + geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), + crs=projection, + ) + # to UTM + trawl_info_new = trawl_info_gdf.to_crs(projection_new) + + trawl_info_new["x"] = trawl_info_new["geometry"].x + trawl_info_new["y"] = trawl_info_new["geometry"].y + + # ---- Bin the latitude data + trawl_info_new["stratum_x"] = pd.cut( + trawl_info_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + trawl_info_new["stratum_y"] = pd.cut( + trawl_info_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + biology_data["trawl_info_df"]["stratum"] = ( + trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) + ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index e8c60da5..579cf463 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,7 +1,6 @@ from typing import Union from pathlib import Path import copy -import yaml from .live_core import( LIVE_DATA_STRUCTURE, @@ -14,7 +13,7 @@ ) from . import live_data_processing as eldp - +from . 
import live_data_loading as eldl class LiveSurvey: """ A real-time processing version of the `echopop` base `Survey` class that ingests biological, @@ -25,7 +24,6 @@ def __init__( self, live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], - update_config: bool = True, verbose: bool = True, ): # Initialize `meta` attribute @@ -33,7 +31,7 @@ def __init__( # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = eldp.live_configuration(Path(live_init_config_path), + self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) # ---- Initialize config key for database files self.config.update( @@ -52,7 +50,7 @@ def __init__( # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict # Ingest data # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"] = eldp.load_acoustic_data(self.config) + self.input["acoustics"]["prc_nasc_df"] = eldl.load_acoustic_data(self.config) # ---- Biology self.input["biology"] = eldp.load_biology_data(self.config) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 4b282e13..0d6a6d58 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,7 +1,9 @@ from sqlalchemy import create_engine, text, Engine, inspect import sqlalchemy as sqla import pandas as pd -from typing import Optional +from typing import Optional, Literal, Union, List +import numpy as np +from pathlib import Path def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): @@ -117,23 +119,31 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) - # ---- Identify any possible DATETIME columns - # datetime_columns = ( - # {col["name"]: str for col in columns_info - # if isinstance(col["type"], sqla.sql.sqltypes.DATETIME)} - # ) - # ---- Encapsulate datetimes with quotes by converting to string - # dataframe = dataframe.astype(datetime_columns) # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] + + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) + # ---- Tuple to String - data_str = ", ".join( - # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" - f"({', '.join(map(lambda x: f'\'{x}\'' - if isinstance(x, str) or isinstance(x, pd.Timestamp) - else 'NULL' if x is None else str(x), row))})" - for row in data_tuple - ) + # data_str = ", ".join( + # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" + # f"({', '.join(map(lambda x: f'\'{x.replace('\\', '\\\\')}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" + # for row in data_tuple + # ) + flattened_data = [format_value(x) for row in data_tuple for x in row] + data_str = "({})".format(", ".join(flattened_data)) + # data_str = ", ".join( + # "({})".format(", ".join(map(format_value, row))) + # for row in data_tuple + # ) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -156,8 +166,7 @@ def sql_insert(connection: 
sqla.Connection, table_name: str, columns: list, data # Commit connection.commit() -from typing import Literal -import numpy as np + def sql_select(connection: sqla.Connection, table_name: str, columns: list, output_type: type = pd.DataFrame): @@ -258,6 +267,75 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation +def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = Path(root_directory) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Create a list of string-formatted Path names + files_str = [str(file) for file in files] + # ---- Create DataFrame + current_files = pd.DataFrame(files_str, columns=["filepath"]) + + # Check for the table `files_read` + files_read_tbl = SQL(db_file, "validate", table_name="files_read") + + # Validate whether the table exists; if not, create the table and then insert + if not files_read_tbl: + # ---- Create table + SQL(db_file, "create", table_name="files_read", dataframe=current_files, + primary_keys = ["filepath"]) + # ---- Populate table + SQL(db_file, "insert", table_name="files_read", dataframe=current_files) + # ---- Break early + return files_str, db_file + + # Query already existing files + previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) + # ---- Insert file list + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + + # Filter out previously processed files + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file + +# TODO: Documentation +def sql_data_exchange(database_file: Path, **kwargs): + + # Check whether the `table_name` table exists + table_exists = SQL(database_file, "validate", **kwargs) + + # If empty and table does not exist + if kwargs["dataframe"].empty and table_exists: + return SQL(database_file, "select", **kwargs) + + # Create table if it does not exist and run the initial insertion + if not table_exists: + # ---- Create table + SQL(database_file, "create", **kwargs) + # ---- Ignore the `id_columns` argument, if present + try: + del kwargs["id_columns"] + except KeyError: + pass + # ---- Insert into table + SQL(database_file, "insert", **kwargs) + # ---- Return the initial dataframe + return kwargs.get("dataframe") + + # Insert into the table + SQL(database_file, "insert", **kwargs) + + # Select existing data frame the database and return the output + return SQL(database_file, "select", **kwargs) # TODO: Documentation @@ -280,31 +358,6 @@ def SQL(db_file: str, command: str, **kwargs): kwargs = {key: value for key, value in kwargs.items() if key in command_args} # ---- Return output return command_function(connection, **kwargs) - # # ---- SELECT - # if command == "select": - # return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # # ---- REPLACE - # elif command == "replace": - # # ---- Extract dataframe - # df_to_add = kwargs["dataframe"] - # # ---- Replace current - # df_to_add.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="replace", index=False) - - # # ---- INSERT - # elif command == "insert": - # # ---- Extract dataframe - # df_to_add = 
kwargs["dataframe"] - # # ---- Insert into the table - # df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", - # index=False) - # # ---- INSPECT - # elif command == "inspect": - # return inspect(engine).get_table_names() - # # ---- OTHER COMMAND - # else: - # connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index c01445b3..ba7c2a2c 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -26,8 +26,23 @@ file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling +def format_vlaue(x): + pass + +def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) - +data_str = ", ".join( + "({})".format(", ".join(format_value(x) for x in row)) + for row in data_tuple +) From 6d439bb18a8118fe12d4dd982627182ac9208b13 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 31 Jul 2024 18:31:20 -0700 Subject: [PATCH 09/81] General changes --- config_files/live_initialization_config.yml | 6 +- echopop/live/live_acoustics.py | 156 ++-- echopop/live/live_biology.py | 812 ++++++++++++++++++++ echopop/live/live_core.py | 12 +- echopop/live/live_data_loading.py | 498 ++++++------ echopop/live/live_spatial_methods.py | 112 ++- echopop/live/live_survey.py | 79 +- echopop/live/sql_methods.py | 289 ++++++- echopop/utils/operations.py | 9 +- echopop/zarr_read_ingest_test.py | 266 ++++++- 10 files changed, 1809 insertions(+), 430 deletions(-) create mode 100644 echopop/live/live_biology.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index a407520e..9436cefc 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -45,11 +45,7 @@ # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates - link_biology_acoustics: - global: False - INPFC: True - closest_haul: False - weighted_haul: False + link_biology_acoustics: INPFC ##################################################################################################################### # Acoustics settings# diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index f526f578..21ba1e23 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,9 +1,55 @@ from typing import Union, Optional - +import numpy as np import pandas as pd from echopop.acoustics import ts_length_regression, to_linear, to_dB +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == 
"kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values + +# TODO: Documentation +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + # TODO: Documentation def average_sigma_bs(length: Union[pd.DataFrame, float, int], weights: Optional[Union[float, int, str]] = None): @@ -46,6 +92,10 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Create copy acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() @@ -62,65 +112,49 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): "center_of_mass": np.nan, "dispersion": np.nan, "evenness": np.nan, - "aggregation": np.nan, + "aggregation_index": np.nan, "occupied_area": 0.0, }) else: - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. 
) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass + # Create the `echometrics` dictionary echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() + # ---- Number of layers + "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), + # ---- Mean Sv (back-calculated) + "mean_Sv": float( + 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + ), + # ---- Max Sv (back-calculated) + "max_Sv": float( + 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + ), + # ---- (Logarithmic) acoustic abundance + "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), + # ---- Center-of-mass + "center_of_mass": float( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ), + # ---- Evenness + "evenness": float( + (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + ), + # ---- Occupied area + "occupied_area": float( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() ) }) - # Compute the dispersion + # Update variable-dependent metrics echometrics.update({ - "dispersion": ( + # ---- Dispersion + "dispersion": float( ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) + ), + # ---- Index of aggregation + "aggregation_index": float(1 / echometrics["evenness"]), }) # Return the dictionary @@ -141,3 +175,27 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # Convert `nasc_dict` to a DataFrame and return the output return pd.Series(nasc_dict) + +def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + # ---- Set dtypes + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation_index": float, "occupied_area": float}) + ) + # ---- Reorder columns + nasc_data_df = nasc_data_df[[ + "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", + "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area" + ]] diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py new file mode 100644 index 00000000..cf04589b --- /dev/null +++ b/echopop/live/live_biology.py @@ -0,0 +1,812 @@ +import pandas as pd +import numpy as np +from .sql_methods import SQL, sql_data_exchange, get_table_key_names +from echopop.acoustics import 
ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + +def merge_trawl_info(biology_dict: dict): + + # Get the trawl information dictionary + trawl_info_df = biology_dict["trawl_info_df"] + + # Update `catch_df` + biology_dict["catch_df"] = biology_dict["catch_df"].merge(trawl_info_df) + + # Update `length_df` + biology_dict["length_df"] = biology_dict["length_df"].merge(trawl_info_df) + + # Update `specimen_df` + biology_dict["specimen_df"] = biology_dict["specimen_df"].merge(trawl_info_df) + + # Drop the trawl information + del biology_dict["trawl_info_df"] + +def prepare_length_distribution(file_configuration: dict): + + # Get the length distribution parameters + distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + + # Create histogram bins + length_bins = ( + np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, + dtype=float) + ) + + # Get the binwidths + binwidth = np.diff(length_bins / 2.0).mean() + + # Generate the equivalent interval boundaries for each bin + intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) + + # Format as a DataFrame and return the output + # ---- Add Categorical interval column + length_bins_df = ( + pd.DataFrame({"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)}) + ) + # ---- Add numeric lower boundary + length_bins_df["lower"] = length_bins_df["interval"].apply(lambda x: x.left).astype(float) + # ---- Add numeric upper boundary + length_bins_df["upper"] = length_bins_df["interval"].apply(lambda x: x.right).astype(float) + + # Return the dataframe that will be incorporated into the biological data attribute + return length_bins_df + +def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) + + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Create new data flag + file_configuration["length_distribution"] = prepare_length_distribution(file_configuration) + # ---- Incorporate additional data, if new data are present + if filtered_biology_output: + # ---- Merge the trawl information and app + merge_trawl_info(filtered_biology_output) + # ---- Apply spatial definitions/stratification, if any + apply_spatial_definitions(filtered_biology_output, spatial_dict) + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + 
filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + sql_results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Assign values for key values + key_values = [str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) + for index in df.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = key_values + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + sql_results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return filtered_biology_output, sql_results_dict + +def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, + file_configuration: dict): + + # Assign contrast columns + contrast_list = [] + # ---- Check for "stratum" column + if "stratum" in specimen_data.columns and "stratum" in length_data.columns: + contrast_list.append(["stratum"]) + # ---- Add the additional columns + contrast_list.append(["haul_num", "species_id", "length"]) + # ---- Concatenate + contrast_columns = list(np.concatenate(contrast_list)) + + # Meld the biological datasets + length_datasets = specimen_data.meld(length_data, + contrasts=contrast_columns) + + # Get the TS-length model parameterization + ts_length_parameters_spp = [ + spp + for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() + if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) + ] + + # Extract the target species information + target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + # ---- Filter out non-target species + length_datasets = ( + length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + ) + # ---- Merge with `length_datasets` + ts_length_df = length_datasets.merge(target_species, + left_on=["species_id"], right_on=["number_code"]) + + # Compute the mean sigma_bs for this particular haul + # ---- Create primary key list + key_list = list(set(contrast_columns) - set(["length"])) + # ---- Compute haul-specific means + sigma_bs_df = ( + ts_length_df + .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + .apply(lambda x: average_sigma_bs(x, weighted="length_count"), include_groups=False) + .reset_index(name="sigma_bs") + ) + + # For SQL database storage purposes, the sum and count are stored instead + # ---- Count sum + sigma_bs_df["sigma_bs_count"] = ts_length_df["length_count"].sum() + # ---- Value sum + sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] + + # Get the database file name + acoustic_db = file_configuration["database"]["acoustics"] + + # Check for `sigma_bs_mean_df` in the database file + # ---- Query database + if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + # ---- Create + SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + primary_keys=list(set(contrast_columns) - set(["length"]))) + # ---- Populate table + 
SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + else: + # ---- Create a filter condition command + condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) + # ---- Update the table key + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # ---- Update the actual `sigma_bs` value in the table + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + +def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["trawl_partition", "sex", "haul_num", "species_id", "length_bin"]) + + # Gather specimen measurements to represent 'all' fish + specimen_data_all = specimen_data.assign(sex="all") + + # Combine sexed and 'all' specimens + # ---- Vertical concatenation + specimen_data_all = pd.concat( + [specimen_data[specimen_data["sex"].isin(["male", "female"])], specimen_data_all], + ignore_index=True + ) + # ---- Remove bad values + specimen_data_all.dropna(subset=["length", "weight"], inplace=True) + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Check for `specimen_data_df` in the database file + # ---- Query database + # if not SQL(biology_db, "validate", table_name="specimen_data_df"): + # ---- Assign values for key values + key_values = [str(index) + "-" + + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) + for index in specimen_data_all.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + specimen_data_all.loc[:, "id"] = key_values + # ---- Insert the new data into the database & pull in the combined dataset + specimen_data_sql = sql_data_exchange(biology_db, + dataframe=specimen_data_all, + table_name="specimen_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Drop SQL db identifier + specimen_data_sql.drop(columns="id", inplace=True) + + # Fit length-weight linear regression by male, female, and all fish + length_weight_regression_df = ( + specimen_data_sql.groupby(["species_id", "sex"]) + .apply( + lambda df: pd.Series( + np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), + index=["rate", "initial"], + ), + include_groups=False, + ) + .reset_index() + ) + + # Predict weights for binned lengths + # ---- Initialize dataframe + weight_fitted_df = distribution_df.copy() + # ---- Expand/merge with length-weight regression coefficients + weight_fitted_df = weight_fitted_df.merge(length_weight_regression_df, how="cross") + # ---- Predict weight per bin + weight_fitted_df["weight_modeled"] = ( + 10.0 ** weight_fitted_df["initial"] + * weight_fitted_df["length_bin"] ** weight_fitted_df["rate"] + ) + # ---- Drop unused columns + weight_fitted_df = weight_fitted_df.filter( + ["length_bin", "species_id", "sex", "weight_modeled"] + ) + + # Adjust for cases where there are too few (< 5) specimens within a given length bin + # ---- Count number of specimens across length bins + weight_fitted_distribution_df = specimen_data_all.count_variable( + contrasts=["species_id", "sex", "length_bin"], variable="length", 
fun="size" + ).set_index(["species_id", "sex", "length_bin"]) + # ---- Get mean weight per bin as well + weight_fitted_distribution_df["weight_mean"] = ( + specimen_data_all.groupby(["species_id", "sex", "length_bin"], observed=False)["weight"] + .mean() + .fillna(0.0) + ) + # ---- Merge with the fitted weights + weight_fitted_distribution_df = weight_fitted_distribution_df.merge( + weight_fitted_df, + on=["species_id", "sex", "length_bin"], + how="outer" + ) + # ---- Fill missing counts + weight_fitted_distribution_df["weight_mean"] = ( + weight_fitted_distribution_df["weight_mean"].fillna(0.0) + ) + # ---- Fill missing weights + weight_fitted_distribution_df["count"] = ( + weight_fitted_distribution_df["count"].fillna(0).astype(int) + ) + # ---- Find fitted weights accounting for low sample sizes + weight_fitted_distribution_df["weight_fitted"] = np.where( + weight_fitted_distribution_df["count"] < 5, + weight_fitted_distribution_df["weight_modeled"], + weight_fitted_distribution_df["weight_mean"], + ) + # ---- Pull out unused columns + weight_fitted_distribution_df = weight_fitted_distribution_df.filter( + ["species_id", "sex", "length_bin", "weight_fitted"] + ) + + # Check for `weight_fitted_df` in the database file + # ---- Create id/primary key + key_values = ["-".join(weight_fitted_distribution_df + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str)) + for idx in weight_fitted_distribution_df.index] + # ---- Add to the output + output_df = weight_fitted_distribution_df.assign(id=key_values) + # ---- Query database + if not SQL(biology_db, "validate", table_name="weight_fitted_df"): + # ---- Create + SQL(biology_db, "create", table_name="weight_fitted_df", + dataframe=output_df, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="weight_fitted_df", + dataframe=output_df, id_columns=["id"]) + else: + # ---- Update the table + sql_group_update(db_file=biology_db, + dataframe=output_df, + table_name="weight_fitted_df", + columns=["weight_fitted"], + unique_columns=["species_id", "sex", "length_bin"], + id_columns=["id"]) + + # Return the dataframe + return weight_fitted_distribution_df + +def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, + length_weight_df: pd.DataFrame, file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Get the spatial key + spatial_key = contrast_columns.copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + # Get database + biology_db = file_configuration["database"]["biology"] + + # Pull the relevant data + # SQL(biology_db, "select", table_name="length_df", + # columns=list(set(length_data.columns) - set(["length_bin"]))) + # list(set(length_data.columns) - set(["length_bin"])) + # Get length distribution + # distribution_df = file_configuration["length_distribution"] + + # Generate sex-specific interpolators for fitted length-weight values for binned length counts + # ---- Parse the male- and female-specific fitted weight values + length_weight_sex = length_weight_df.copy()[length_weight_df["sex"].isin(["male", "female"])] + # ---- Create interpolator functions + interpolators = group_interpolator_creator( + grouped_data=length_weight_sex, + independent_var="length_bin", + dependent_var="weight_fitted", + contrast=["sex", "species_id"], + ) + # ---- Create helper/lambda function + def weight_interpolator(dataframe_row): + sex = 
dataframe_row["sex"] + species_id = dataframe_row["species_id"] + length = dataframe_row["length"] + if (sex, species_id) in interpolators: + return interpolators[(sex, species_id)](length) + else: + return None + + # Extract only sexed fish from the unaged (station 1) length dataset + length_data_sexed = length_data[length_data["sex"].isin(["male", "female"])].copy() + # ---- Add interpolated weights to the general length dataset + length_data_sexed.loc[:, "weight_interp"] = ( + length_data_sexed.apply(weight_interpolator, axis=1) * length_data_sexed["length_count"] + ) + # ---- Convert interpolated weights (summed across length counts) into a table + length_table_sexed = ( + length_data_sexed + .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight_interp"].sum() + ).reset_index() + + # Remove specimen data with missing data required for this analysis + # ---- Drop unsexed fish + specimen_data_filtered = specimen_data[specimen_data["sex"].isin(["male", "female"])].copy() + # ---- Remove NaN + specimen_data_filtered = specimen_data_filtered.dropna(subset=["length", "weight"]) + # ---- Convert to a table + specimen_table_sexed = ( + specimen_data_filtered + .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight"].sum() + ).reset_index() + + # Check for `length_weight_df` in the database file + # ---- Create id/primary key + key_values = ["-".join(length_table_sexed.reset_index() + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str)) + for idx in length_table_sexed.reset_index().index] + # ---- Add to the output + length_table_sexed["id"] = key_values + # ---- Query database + if not SQL(biology_db, "validate", table_name="length_weight_df"): + # ---- Create + SQL(biology_db, "create", table_name="length_weight_df", + dataframe=length_table_sexed, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="length_weight_df", + dataframe=length_table_sexed, id_columns=["id"]) + else: + # ---- Update the table + sql_group_update(db_file=biology_db, + dataframe=length_table_sexed, + table_name="length_weight_df", + columns=["weight_interp"], + unique_columns=contrast_columns, + id_columns=["id"]) + # length_sql_sexed + + + # , specimen_sql_sexed + + # Return outputs + return length_table_sexed, specimen_table_sexed + +def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: pd.DataFrame, + length_binned: pd.DataFrame, file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + + # Get unique values of each contrast column across the biological datasets + dfs = [pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip(contrast_columns, [specimen_binned, + specimen_binned_filtered, + length_binned])] + # ---- Reduce into a single DataFrame + count_total = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # ---- Set the indices + count_total.set_index(contrast_columns, inplace=True) + # ---- Specimen count + count_total["total_specimen"] = specimen_binned.groupby(contrast_columns)["count"].sum() + # ---- Specimen filtered count + count_total["total_specimen_filtered"] = ( + specimen_binned_filtered.groupby(contrast_columns)["count"].sum() + ) + # ---- Length count + count_total["total_length"] = length_binned.groupby(contrast_columns)["count"].sum() + # ---- Fill NaN + 
count_total.fillna(0, inplace=True) + count_total = ( + count_total.reset_index().set_index(list(set(contrast_columns) - set(["sex", "species_id"]))) + ) + # ---- Grand totals + count_total["total_overall"] = ( + count_total.loc[count_total.sex == "all", "total_specimen_filtered"] + + count_total.loc[count_total.sex == "all", "total_length"] + ) + # ---- Reset index + count_total = count_total.reset_index() + + # Compute the number proportions for the specimen data + specimen_number_proportion = specimen_binned_filtered[ + specimen_binned_filtered["sex"].isin(["male", "female", "all"]) + ].merge( + count_total[list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"])))], + on=contrast_columns + ) + # ---- Within-dataset proportion + specimen_number_proportion["proportion_number_specimen"] = ( + specimen_number_proportion["count"] / specimen_number_proportion["total_specimen_filtered"] + ) + # ---- Overall survey proportion + specimen_number_proportion["proportion_number_specimen_overall"] = ( + specimen_number_proportion["count"] / specimen_number_proportion["total_overall"] + ) + # ---- Compute the sex proportions + sex_number_proportions = ( + specimen_number_proportion.groupby(contrast_columns, observed=False)[ + "proportion_number_specimen_overall" + ] + .sum() + .reset_index() + ) + + # Compute the number proportions for the length data + length_number_proportion = length_binned[ + length_binned["sex"].isin(["male", "female", "all"]) + ].merge( + count_total[list(set(contrast_columns).union(set(["total_length", "total_overall"])))], + on=contrast_columns + ) + # ---- Within-dataset proportion + length_number_proportion["proportion_number_length"] = ( + length_number_proportion["count"] / length_number_proportion["total_length"] + ) + # ---- Overall survey proportion + length_number_proportion["proportion_number_length_overall"] = ( + length_number_proportion["count"] / length_number_proportion["total_overall"] + ) + + # Gather unaged (sexed) number proportions + # ---- Merge + sex_number_proportions = sex_number_proportions.merge( + length_number_proportion.groupby(contrast_columns)[ + "proportion_number_length_overall" + ] + .sum() + .reset_index(), + how="outer", + ).fillna(0.0) + # ---- Sum overall total across datasets + sex_number_proportions["proportion_number_overall"] = ( + sex_number_proportions.proportion_number_specimen_overall + + sex_number_proportions.proportion_number_length_overall + ) + + # Return the output + return specimen_number_proportion, length_number_proportion, sex_number_proportions + +def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id", "length_bin"]) + + # Bin counts by sex + # ---- Specimen + specimen_number_distribution = pd.concat( + [specimen_data, specimen_data.assign(sex="all")] + ).count_variable( + contrasts=contrast_columns, + variable="length", + fun="size", + ) + # ---- Filter out unsexed data for parallel number counts and drop any NA's + specimen_number_distribution_filtered = ( + pd.concat( + [ + specimen_data[specimen_data.sex != "unsexed"], + specimen_data[specimen_data.sex != "unsexed"].assign(sex="all"), + ] + ) + .dropna(subset=["length", "weight"]) + .count_variable( + contrasts=contrast_columns, + variable="length", + fun="size", + ) + ) + + # 
Repeat for the aggregated data + # ---- Length + length_number_distribution = pd.concat( + [length_data, length_data.assign(sex="all")] + ).count_variable( + contrasts=contrast_columns, + variable="length_count", + fun="sum", + ) + + return ( + specimen_number_distribution, + specimen_number_distribution_filtered, + length_number_distribution + ) + + +# def length_bin_counts(biology_dict: dict, file_configuration: dict): + +# # Get the spatial column name, if there is one +# contrast_columns = file_configuration["spatial_column"].copy() +# # ---- Append additional columns that will be used +# contrast_columns.extend(["sex", "species_id", "length_bin"]) + +# # Get database file +# biology_db = file_configuration["database"]["biology"] + +# # Get distribution data +# distribution_df = file_configuration["length_distribution"] + +# # Generate number counts for the length distribution +# length_datasets = ( +# biology_dict["specimen_df"] +# .meld(biology_dict["length_df"], +# contrasts=list(set(contrast_columns).union(["length_bin"]))) +# ) +# # ---- Create 'all' +# length_datasets_all = pd.concat([ +# length_datasets[length_datasets["sex"].isin(["male", "female"])], +# length_datasets.assign(sex="all") +# ]) + +# # Collapse by each bin +# grouped_length = ( +# length_datasets_all +# .groupby(contrast_columns, observed=False)["length_count"].sum() +# ) + +# # Get distinct DataFrame columns +# distinct_keys = ( +# grouped_length +# .reset_index() +# .loc[:, list(set(contrast_columns) - set(["length_bin"]))].drop_duplicates() +# ) + +# # Create complete DataFrame +# complete_distrib_df = ( +# distribution_df.merge(distinct_keys, how="cross").set_index(contrast_columns) +# ) +# # ---- Pre-allocate the "length_count" column +# complete_distrib_df.loc[:, "count"] = 0 +# # ---- Add the computed counts +# complete_distrib_df.loc[grouped_length.index, "count"] = grouped_length +# # ---- Create output DataFrame +# output_df = complete_distrib_df.filter(["count"]).reset_index() + +# # Check for `length_count_df` in the database file +# # ---- Create id/primary key +# key_values = ["-".join(output_df +# .loc[idx, ["species_id", "sex", "length_bin"]] +# .values.astype(str)) +# for idx in output_df.index] +# # ---- Add to the output +# output_df["id"] = key_values +# # ---- Query database +# if not SQL(biology_db, "validate", table_name="length_count_df"): +# # ---- Create +# SQL(biology_db, "create", table_name="length_count_df", +# dataframe=output_df, primary_keys=["id"]) +# # ---- Populate table +# SQL(biology_db, "insert", table_name="length_count_df", +# dataframe=output_df, id_columns=["id"]) +# else: +# # ---- Update the table +# sql_group_update(db_file=biology_db, +# dataframe=output_df, +# table_name="length_count_df", +# columns=["count"], +# unique_columns=contrast_columns, +# id_columns=["id"]) + +# # Return output +# return output_df + + +def bin_length_data(biology_dict: dict, distribution_df: pd.DataFrame): + + # Create Lambda help function + def _quantize_lengths(dataset, distribution): + # ---- Cut/merge the underlying histogram/discretized length bins + if "length" in dataset.columns: + # ---- Cut the intervals + dataset["length_bin"] = pd.cut(dataset["length"], + np.unique(np.hstack([distribution["lower"], + distribution["upper"]])), + labels=distribution["length_bin"]).astype(float) + # ---- Return the dataset + return dataset + + # Update the data dictionary + biology_dict.update({ + k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items() + }) + + +def 
compute_average_weights(specimen_number_proportion: pd.DataFrame, + length_number_proportion: pd.DataFrame, + sex_number_proportions: pd.DataFrame, + length_weight_df: pd.DataFrame, + distribution_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + overall_proportions = sex_number_proportions[sex_number_proportions["sex"] == "all"] + updated_proportions = sex_number_proportions.copy() + + updated_proportions["number_proportion_length_all"] = overall_proportions["proportion_number_length_overall"].values[0] + updated_proportions["number_proportion_specimen_all"] = overall_proportions["proportion_number_specimen_overall"].values[0] + + # Calculate the mixed aged and unaged number proportions + updated_proportions["proportion_length"] = ( + updated_proportions["number_proportion_length_all"] / + (updated_proportions["number_proportion_length_all"] + + updated_proportions["proportion_number_specimen_overall"]) + ) + # ---- Calculate aged number proportions per sex per stratum + updated_proportions["proportion_specimen"] = ( + updated_proportions["proportion_number_specimen_overall"] / ( + updated_proportions["proportion_number_specimen_overall"] + + updated_proportions["proportion_length"] + ) + ) + # ---- Reduce the columns + proportion_df = ( + updated_proportions.filter(contrast_columns + ["proportion_length", "proportion_specimen"]) + ) + + # Combine the aged-unaged (or station-specific) proportions for calculations + # ---- Wide-to-long DataFrame + station_proportions = pd.wide_to_long( + proportion_df, + stubnames="proportion", + i=contrast_columns, + j="group", + sep="_", + suffix="\\w+", + ).reset_index() + # ---- Convert to Table (to replicate indexed matrix operations) + station_proportions_table = station_proportions.pivot_table( + index=["species_id", "group", "sex"], + columns=file_configuration["spatial_column"].copy(), values="proportion" + ).fillna(0.0) + + # Calculate the number length proportions that will later be converted into weight + # ---- Specimen + specimen_length_distribution = ( + specimen_number_proportion.groupby(contrast_columns + ["length_bin"], observed=False)[ + "proportion_number_specimen" + ] + .sum() + .reset_index(name="number_proportion") + ) + # ---- Length + length_length_distribution = ( + length_number_proportion[length_number_proportion.sex != "unsexed"][ + contrast_columns + ["length_bin", "proportion_number_length"] + ].rename(columns={"proportion_number_length": "number_proportion"}) + ) + + # Get unique values of each contrast column across the biological datasets + dfs = [pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip(contrast_columns, [specimen_number_proportion, + length_number_proportion, + sex_number_proportions])] + # ---- Reduce into a single DataFrame + full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + + # + length_distribution_df = distribution_df.copy() + complete_distrib_df = ( + length_distribution_df.merge(full_contrast_keys, how="cross") + .drop(columns=["interval", "lower", "upper"]) + .set_index(contrast_columns + ["length_bin"]) + ) + + specimen_length_complete = complete_distrib_df.copy() + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]) + specimen_length_complete.loc[:, 
"number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) + + length_length_complete = complete_distrib_df.copy() + length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]) + length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) + + # ---- Concatenate the two datasets + combined_number_proportions = ( + pd.concat([specimen_length_complete.assign(group="specimen"), + length_length_complete.assign(group="length")]) + ).reset_index() + # ---- Convert to Table (to replicate indexed matrix operations) + length_proportions_table = combined_number_proportions.pivot_table( + index=["species_id", "group", "sex", "length_bin"], + columns=file_configuration["spatial_column"].copy(), + values="number_proportion", + observed=False, + ).fillna(0.0) + + # Convert the fitteed weights into a Table (to replicate index matrix operations) + fitted_weight_table = length_weight_df.pivot_table( + index=["species_id", "sex", "length_bin"], values="weight_fitted", observed=False + ) + + # Calculate the average weights for male, female, and all fish within each stratum + # ---- All + fitted_weight_table.loc[:, "all", :] + weight_all = fitted_weight_table.loc[:, "all", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "all"] + * station_proportions_table.loc[:, "specimen", "all"] + + length_proportions_table.loc[:, "length", "all"] + * station_proportions_table.loc[:, "length", "all"] + ) + weight_male = fitted_weight_table.loc[:, "male", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "male"] + * station_proportions_table.loc[:, "specimen", "male"] + + length_proportions_table.loc[:, "length", "male"] + * station_proportions_table.loc[:, "length", "male"] + ) + weight_female = fitted_weight_table.loc[:, "female", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "female"] + * station_proportions_table.loc[:, "specimen", "female"] + + length_proportions_table.loc[:, "length", "female"] + * station_proportions_table.loc[:, "length", "female"] + ) + # ---- Combine the averaged weights for each sex and all fish + fitted_weight_df = full_contrast_keys.copy() + fitted_weight_df["average_weight"] = ( + np.concatenate([weight_all, weight_male, weight_female]) + ) + + # Return output + return fitted_weight_df \ No newline at end of file diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 28a63237..677cddc3 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -79,11 +79,15 @@ }, "length": { "dtypes": { + "operation_number": int, + "partition": str, "sex": str, "rounded_length": int, "frequency": int, }, "names": { + "operation_number": "haul_num", + "partition": "trawl_partition", "sex": "sex", "rounded_length": "length", "frequency": "length_count", @@ -91,13 +95,17 @@ }, "specimen": { "dtypes": { - "rounded_length": int, + "operation_number": int, + "partition": str, + "length": float, "organism_weight": float, "sex": str, }, "names": { + "operation_number": "haul_num", + "partition": "trawl_partition", "sex": "sex", - "rounded_length": "length", + "length": "length", "organism_weight": "weight" }, }, diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index ce5a06f7..1220591f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -13,6 +13,8 @@ SPATIAL_CONFIG_MAP ) +from .live_spatial_methods 
import create_inpfc_strata + # TODO: Incorporate complete YAML file validator # TODO: Documentation def live_configuration(live_init_config_path: Union[str, Path], @@ -55,53 +57,105 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -# TODO: Documentation -def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: +def read_acoustic_files(acoustic_files: List[Path]) -> tuple: - # Get acoustic directory and initialization settings - # ---- Create the full filepath - directory_path = Path(root_directory) / file_settings["directory"] - # ---- Get the defined file extension - file_extension = file_settings["extension"] + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_extension}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_extension}` files found in [{directory_path}]!" - ) + # Read all of the zarr files + results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) + for i, (data_df, unit_dict) in enumerate( + read_acoustic_zarr(Path(file), acoustics_config_map) + for file in acoustic_files + )] + + # Concatenate the dataframe component + acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index = True) + # ---- Add the `acoustic_data_units` to the dictionary and output the resulting tuple + return acoustic_data_df, results_list[0][1] if results_list else None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] - # Return the output - return data_files + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def read_biology_files(biology_files: List[Path], file_configuration: dict): -def read_acoustic_zarr(acoustic_files: Path) -> tuple: + # Get the biology data file settings + file_settings = file_configuration["input_directories"]["biology"] # Get the file-specific settings, datatypes, columns, etc. 
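+    # NOTE: `LIVE_INPUT_FILE_CONFIG_MAP["biology"]` (defined in `live_core.py`) holds one
+    # entry per biological dataset (e.g. "length", "specimen"), each pairing a `dtypes`
+    # map with a `names` map used to coerce and rename the raw columns (e.g.
+    # `operation_number` -> `haul_num`, `partition` -> `trawl_partition`,
+    # `frequency` -> `length_count`) before the per-dataset CSVs are concatenated below.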
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + ) + + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + biology_file_ids[dataset], + biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Return the output + return biology_output + +def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: + # Format the file reading configuration + # ---- Concatenate into a full configuration map + full_config_map = {**config_map["xarray_coordinates"], + **config_map["xarray_variables"]} + # Determine the file loading method for the `acoustic_files` - if len(acoustic_files) > 1: - zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - else: - zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto") # Pre-process the Dataset, convert it to a DataFrame, and validate the structure # ---- Convert to a DataFrame @@ -119,6 +173,9 @@ def read_acoustic_zarr(acoustic_files: Path) -> tuple: # ---- Select defined columns zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + # Add the filename as a column + zarr_data_df_filtered["source"] = 
Path(file).name + # Gather some of the units data_units = { "longitude": zarr_data_ds.longitude.units, @@ -130,99 +187,55 @@ def read_acoustic_zarr(acoustic_files: Path) -> tuple: return zarr_data_df_filtered, data_units # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change - else: - return frequency_values - -# TODO: Documentation -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, - file_configuration: dict) -> pd.DataFrame: - - # Get acoustic processing settings - acoustic_analysis_settings = file_configuration["acoustics"] - # ---- Extract the fined acoustic frequency - transmit_settings = acoustic_analysis_settings["transmit"] - - # Filter the dataset - # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) - ) - # ---- Filter out any unused frequency coordinates - prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) +def validate_data_directory(file_configuration: dict, dataset: str, + input_filenames: Optional[list] = None) -> List[Path]: -# TODO: Documentation -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + # Get the dataset file settings + file_settings = file_configuration["input_directories"][dataset] # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files, file_configuration["database"]["acoustics"] = ( - query_processed_files(root_directory, file_settings, acoustic_files) - ) - - # Read in the acoustic data files - if new_acoustic_files: - # ! 
[REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - # ---- Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - # ---- Return output - return prc_nasc_df_processed - else: - return None - -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): + if "data_root_dir" in file_configuration.keys(): + root_directory = Path(file_configuration["data_root_dir"]) + else: + root_directory = Path() + # ---- File folder + data_directory = Path(file_settings["directory"]) + # ---- Createa directory path + directory_path = root_directory / data_directory - # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subfile_str = [str(file) for file in list(subfile_path_obj)] + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) - # Convert list of proposed files from Path to String - file_str = [str(file) for file in list(files)] + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_settings["extension"]}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings["extension"]}` files found in [{directory_path}]!" + ) - # Find intersection with the proposed filenames and return the output - return list(set(subfile_str).intersection(set(file_str))) + # Check and format specific input filenames + if isinstance(input_filenames, list): + data_files = [directory_path / filename for filename in input_filenames] + # ---- Raise Error + elif input_filenames is not None: + raise TypeError( + "Data loading argument `input_filenames` must be a list." 
+ ) + + # Query the SQL database to process only new files (or create the db file in the first place) + valid_files, file_configuration["database"][dataset] = ( + query_processed_files(root_directory, file_settings, data_files) + ) + + # Return the valid filenames/paths + return valid_files def compile_filename_format(file_name_format: str): @@ -276,89 +289,6 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): # Return the resulting DataFrame return df_validated -def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: - - # Get the data input column names - if data_dict[table_name].empty: - # ---- Inspect the table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of the data columns - table_columns = list(inspected_table.keys()) - else: - # ---- Get the DataFrame column names - table_columns = data_dict[table_name].columns - - # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - "latitude"]) - ) - - # Return a list of the output - return list(key_columns) - -def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): - - # Create dataframe copy - data_copy = biology_data.copy() - - # Iterate through dictionary to apply filters (if present) - for column, value in filter_dict.items(): - if column in data_copy.columns: - data_copy = data_copy[data_copy[column] == value] - - # Return output - return data_copy - -def preprocess_biology_data(biology_output: dict, file_configuration: dict): - - # Get SQL database file - biology_db = file_configuration["database"]["biology"] - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = file_configuration["biology"]["catch"]["partition"] - # ---- Create filter dictionary - filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - - # Apply the filter - filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # ---- Swap this out if no new files are present - if not filtered_biology_output: - # ---- Get available tables - table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) - # ---- Plug into the dictionary - filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary - results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Get identifier columns - key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) - # ---- Create copy - df = df.copy() - # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint - df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) - # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - results_dict.update({table_name: table_df.drop(columns="id")}) - - # Return the output - return results_dict - def infer_datetime_format(timestamp_str: Union[int, 
str]): patterns = { r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS @@ -392,70 +322,70 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) -def load_biology_data(file_configuration: dict): - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["biology"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - biology_files = validate_data_directory(root_directory, file_settings) - - # Query `biology.db` to process only new files (or create the db file in the first place) - # SQL(biology_db, "drop", table_name="files_read") - new_biology_files, file_configuration["database"]["biology"] = ( - query_processed_files(root_directory, file_settings, biology_files) - ) - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] +# def load_biology_data(file_configuration: dict): + +# # Get the acoustic file settings and root directory +# # ---- File settings +# file_settings = file_configuration["input_directories"]["biology"] +# # ---- Root directory +# root_directory = file_configuration["data_root_dir"] + +# # Get and validate the acoustic data directory and files +# biology_files = validate_data_directory(root_directory, file_settings) + +# # Query `biology.db` to process only new files (or create the db file in the first place) +# # SQL(biology_db, "drop", table_name="files_read") +# new_biology_files, file_configuration["database"]["biology"] = ( +# query_processed_files(root_directory, file_settings, biology_files) +# ) + +# # Get the file-specific settings, datatypes, columns, etc. 
+# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# # ---- Create filepath object +# directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - # Add SQL file to dict - file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] - ) - - # Iterate through the different biology datasets and read them in - for dataset in list(biology_file_ids.keys()): - # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - file_settings["file_name_formats"][dataset], - new_biology_files, - file_settings["extension"]) - # ---- If there are dataset files available - if dataset_files: - # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), - file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) - for file in dataset_files] - # ---- Concatenate the dataset - dataframe_combined = pd.concat(dataframe_list, ignore_index=True) - # ---- Lower-case sex - if "sex" in dataframe_combined.columns: - dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() - # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() - # ---- Reformat datetime column - if "datetime" in dataframe_combined.columns: - dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) - # ---- Add to the data dictionary - biology_output[f"{dataset}_df"] = dataframe_combined - - # Pre-process and return the results - return preprocess_biology_data(biology_output, file_configuration) +# # Add SQL file to dict +# file_configuration["database"]["biology"] = ( +# Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] +# ) + +# # Iterate through the different biology datasets and read them in +# for dataset in list(biology_file_ids.keys()): +# # ---- Get dataset-specific file lists +# dataset_files = filter_filenames(directory_path, +# file_settings["file_name_formats"][dataset], +# new_biology_files, +# file_settings["extension"]) +# # ---- If there are dataset files available +# if dataset_files: +# # ---- Read in validated biology data +# dataframe_list = [read_biology_csv(Path(file), +# file_settings["file_name_formats"][dataset], +# biology_config_map[dataset]) +# for file in dataset_files] +# # ---- Concatenate the dataset +# dataframe_combined = pd.concat(dataframe_list, ignore_index=True) +# # ---- Lower-case sex +# if "sex" in dataframe_combined.columns: +# dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() +# # ---- Lower-case trawl partition type +# if "trawl_partition" in dataframe_combined.columns: +# dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() +# # ---- Reformat datetime column +# if "datetime" in dataframe_combined.columns: +# dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) +# # ---- Add to the data dictionary +# 
biology_output[f"{dataset}_df"] = dataframe_combined + +# # Pre-process and return the results +# return preprocess_biology_data(biology_output, file_configuration) def validate_hauls_config(spatial_config: dict, link_method: str): @@ -581,6 +511,34 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): f"be one of the following types within a list: {model}." ) +def configure_spatial_settings(file_configuration: dict): + + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Get (geo)spatial config + spatial_config = file_configuration["geospatial"] + # ---- Remove case sensitivity + spatial_config = {key.lower(): value for key, value in spatial_config.items()} + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = spatial_config["link_biology_acoustics"] + + # Validate the configuration + validate_spatial_config(spatial_config) + + # Create spatial dictionary that will be added as an `input` + spatial_dict = {"link_method": acoustics_biology_link} + + # Assign the spatial link constraints to the acoustic and biological data + if acoustics_biology_link == "INPFC": + # ---- Update spatial dictionary + spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + # ---- Update the stratum classification in the primary file configuration + file_configuration.update({"spatial_column": ["stratum"]}) + else: + # ---- Empty `spatial_column` key + file_configuration.update({"spatial_column": []}) + + # Return the dictionary as an output + return spatial_dict def validate_spatial_config(spatial_config: dict): diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index f38b130b..c83f35de 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -5,10 +5,10 @@ from ..spatial.projection import utm_string_generator import shapely.geometry -def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): +def create_inpfc_strata(spatial_config: dict): # Extract the INPFC definitions - inpfc_definitions = spatial_config["inpfc"] + inpfc_definitions = spatial_config["inpfc"] # Create latitude bins latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) @@ -17,36 +17,88 @@ def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_con [np.max(inpfc_definitions["stratum_names"]) + 1]]) # Create spatial key - spatial_config["spatial_key"] = pd.DataFrame({ - "latitude_limit": inpfc_definitions["latitude_max"], + inpfc_strata_df = pd.DataFrame({ + "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + "latitude_interval": pd.cut(np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + latitude_bins), + "stratum": bin_names, }) - # ---- Cut - spatial_config["spatial_key"]["stratum"] = ( - pd.cut(inpfc_definitions["latitude_max"], - latitude_bins, - right = True, - labels = bin_names) - ) - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - # ---- Bin the latitude data - acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( - acoustic_data["prc_nasc_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - # Get the `trawl_info_df` values, if they exist, and apply stratification information - if not biology_data["trawl_info_df"].empty: - # ---- Bin the latitude data - biology_data["trawl_info_df"]["stratum"] = pd.cut( - 
biology_data["trawl_info_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) + # Add boundaries + # ---- Lower + inpfc_strata_df["lower"] = inpfc_strata_df["latitude_interval"].apply(lambda x: x.left) + # ---- Upper + inpfc_strata_df["upper"] = inpfc_strata_df["latitude_interval"].apply(lambda x: x.right) + + # Return the dataframe + return inpfc_strata_df + +def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): + + # Bin the data based on latitude + if "latitude" in dataset.columns: + dataset["stratum"] = pd.cut( + dataset["latitude"], + np.unique(np.hstack([inpfc_df["lower"], inpfc_df["upper"]])), + labels = inpfc_df["stratum"] + ).astype(int) + + # Return the INPFC-stratified dataset + return dataset + +def apply_spatial_definitions(data_dict: dict, spatial_dict: dict): + + # Get the acoustic-biology link method + link_method = spatial_dict["link_method"] + + # Apply spatial definitions + if link_method == "INPFC": + data_dict.update({ + k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in data_dict.items() + }) + +# def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + +# # Extract the INPFC definitions +# inpfc_definitions = spatial_config["inpfc"] + +# # Create latitude bins +# latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) +# # ---- Append 1 more stratum layer +# bin_names = np.concatenate([inpfc_definitions["stratum_names"], +# [np.max(inpfc_definitions["stratum_names"]) + 1]]) + +# # Create spatial key +# spatial_config["spatial_key"] = pd.DataFrame({ +# "latitude_limit": inpfc_definitions["latitude_max"], +# }) +# # ---- Cut +# spatial_config["spatial_key"]["stratum"] = ( +# pd.cut(inpfc_definitions["latitude_max"], +# latitude_bins, +# right = True, +# labels = bin_names) +# ) + +# # Get the `prc_nasc_df` values, if they exist, and apply stratification information +# if not acoustic_data["prc_nasc_df"].empty: +# # ---- Bin the latitude data +# acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( +# acoustic_data["prc_nasc_df"]["latitude"], +# latitude_bins, +# right = True, +# labels = bin_names, +# ) + +# # Get the `trawl_info_df` values, if they exist, and apply stratification information +# if not biology_data["trawl_info_df"].empty: +# # ---- Bin the latitude data +# biology_data["trawl_info_df"]["stratum"] = pd.cut( +# biology_data["trawl_info_df"]["latitude"], +# latitude_bins, +# right = True, +# labels = bin_names, +# ) def define_boundary_box(boundary_dict: dict, projection: str): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 579cf463..306ddeb9 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Optional from pathlib import Path import copy @@ -12,6 +12,11 @@ to_linear ) +from .sql_methods import query_processed_files +from .live_acoustics import preprocess_acoustic_data, integrate_nasc +from .live_biology import preprocess_biology_data + + from . import live_data_processing as eldp from . 
import live_data_loading as eldl class LiveSurvey: @@ -47,13 +52,69 @@ def __init__( # Initialize the results attribute self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) - # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict - # Ingest data - # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"] = eldl.load_acoustic_data(self.config) - # ---- Biology - self.input["biology"] = eldp.load_biology_data(self.config) - + # Configure the spatial settings + self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) + # TODO: Add verbosity for printing database filepaths/connections if verbose: - pass \ No newline at end of file + pass + + + def load_acoustic_data(self, + input_filenames: Optional[list] = None, + verbose: bool = True): + + # Validate the data directory and format the filepaths + acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", + input_filenames=input_filenames) + + # Read in the acoustic data files + if acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + # ---- Read in the acoustic data files + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + self.config["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df, + self.config) + # TODO: Add verbosity for printing database filepaths/connections + if verbose: + print( + f"The following acoustic files have been processed:\n" + f"{"\n".join(acoustic_files)}." + ) + else: + self.input["acoustics"]["prc_nasc_df"] = None + + def load_biology_data(self, + input_filenames: Optional[list] = None, + verbose: bool = True): + + # Validate the data directory and format the filepaths + biology_files = eldl.validate_data_directory(self.config, dataset="biology", + input_filenames=input_filenames) + + # TODO: Add verbosity for printing database filepaths/connections + if biology_files and verbose: + print( + f"The following biological files have been processed:\n" + f"{"\n".join(biology_files)}." 
+ ) + + # Read in the biology data files + initial_biology_output = eldl.read_biology_files(biology_files, self.config) + + # Preprocess the biology dataset + self.input["biology"], self.input["biology_processed"] = ( + preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + ) + + def process_biology_data(self): + + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + # ---- Processed + biology_processed = self.input["biology_processed"] + diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0d6a6d58..4d253455 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -4,6 +4,7 @@ from typing import Optional, Literal, Union, List import numpy as np from pathlib import Path +import re def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): @@ -60,7 +61,7 @@ def sql_validate(connection: sqla.Connection, table_name: str): inspector = inspect(connection) return table_name in inspector.get_table_names() -def sql_inspect(connection: sqla.Connection, table_name: str): +def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] = None): """ Get a list of all tables present @@ -71,15 +72,26 @@ def sql_inspect(connection: sqla.Connection, table_name: str): list: True if the table exists, False otherwise. """ - # Create 'inspector' for the db file - inspector = inspect(connection) - - # Retrieve column information - column_info = inspector.get_columns(table_name) - - # Format as a dictionary - return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} - + # Inspect the columns from the table + if columns is None: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Retrieve column information + column_info = inspector.get_columns(table_name) + # ---- Format as a dictionary and return the output + return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} + else: + # Inspect unique values in specified columns + # ---- Create SQL command + sql_command = f"SELECT DISTINCT {", ".join(columns)} FROM {table_name};" + # ---- Execute + table = connection.execute(text(sql_command.strip())) + # ---- Extract unique values + unique_values = table.fetchall() + # ---- Format as a dictionary and return the output + return ( + {col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns)} + ) def sql_drop(connection: sqla.Connection, table_name: str): """ @@ -134,16 +146,13 @@ def format_value(x): # ---- Tuple to String # data_str = ", ".join( - # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" - # f"({', '.join(map(lambda x: f'\'{x.replace('\\', '\\\\')}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" - # for row in data_tuple - # ) - flattened_data = [format_value(x) for row in data_tuple for x in row] - data_str = "({})".format(", ".join(flattened_data)) - # data_str = ", ".join( - # "({})".format(", ".join(map(format_value, row))) + # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" + # f"({', '.join(map(lambda x: f'\'{x}\'' + # if isinstance(x, str) or isinstance(x, pd.Timestamp) + # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) + data_str = ", 
".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -166,15 +175,92 @@ def format_value(x): # Commit connection.commit() +def sql_update(connection: sqla.Connection, table_name: str, columns: list, + dataframe: Optional[pd.DataFrame] = None, operation: Optional[str] = None, + condition: Optional[str] = None): + """ + Insert data into a table. + + Args: + connection (Connection): The SQLAlchemy Connection instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. + """ + + # Prepare the SQL statement for insertion + # ---- Check whether `columns` is '*' + if "*" in columns: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + columns = [col['name'] for col in inspector.get_columns(table_name)] + # ---- If not a List + elif not isinstance(columns, list): + columns = [columns] + + # Format the SET command + # ---- Update column by applying arithmetic between table and dataframe + if operation is not None and dataframe is not None: + set_list = [f"{column} = {column} {operation} {dataframe[column].values[0]}" + for column in columns] + # ---- Update column by applying arithmetic within table + if dataframe is None and operation is not None: + # ---- Make sure `operation` is a list + if not isinstance(operation, list): + operation = [operation] + # ---- Break up the columns into their components + set_list = [f"{column} = {calculation}" for column, calculation in zip(columns, operation)] + # ---- Update column by setting a defined value + if dataframe is not None and operation is None: + set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] + # ---- Join the list + set_clause = ', '.join(set_list) + + # Add the WHERE clause if a parsed condition is provided + if condition is not None: + # ---- Parse the conditional string + parsed_condition = parse_condition(condition) + set_clause += " WHERE " + parsed_condition + + # Complete the full command + sql_command = f"UPDATE {table_name} SET {set_clause};" -def sql_select(connection: sqla.Connection, table_name: str, columns: list, + # Execute + connection.execute(text(sql_command.strip())) + + # Commit + connection.commit() + +def sql_select(connection: sqla.Connection, table_name: str, + columns: Optional[Union[list, str]] = None, + condition: Optional[str] = None, output_type: type = pd.DataFrame): + # Columns + if columns is None: + column_names = "*" + elif isinstance(columns, list) or isinstance(columns, pd.Index): + column_names = ", ".join(columns) + else: + column_names = columns + # Prepare the columns as a string of column names - column_names = ", ".join(columns) + # if isinstance(columns, list): + # column_names = ", ".join(columns) + # else: + # column_names = columns # Format the SQL command - sql_command = f"SELECT {column_names} FROM {table_name};" + # sql_command = f"SELECT {column_names} FROM {table_name};" + sql_command = f"SELECT {column_names} FROM {table_name}" + + # Add the WHERE clause if a parsed condition is provided + if condition is not None: + # ---- Parse the conditional string + parsed_condition = parse_condition(condition) + sql_command += " WHERE " + parsed_condition # Execute the command table = 
connection.execute(text(sql_command)) @@ -200,7 +286,8 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, # ---- Create DataFrame output_df = pd.DataFrame(data, columns=table.keys()) # ---- Format the expected datatypes - df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] for col, dtype in table_dtypes.items()} + df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] + for col, dtype in table_dtypes.items() if col in columns } # ---- Apply the dtypes return output_df.astype(df_dtypes) else: @@ -226,13 +313,14 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), "insert": dict(function=sql_insert, args=["table_name", "columns", "dataframe", "id_columns"]), - "inspect": dict(function=sql_inspect, args=["table_name"]), + "inspect": dict(function=sql_inspect, args=["table_name", "columns"]), "map": dict(function=sql_map_tables, args=[]), - "select": dict(function=sql_select, args=["table_name", "columns", "output_type"]), + "select": dict(function=sql_select, args=["table_name", "columns", "output_type", "condition"]), + "update": dict(function=sql_update, args=["table_name", "columns", "condition", "operation", + "dataframe"]), "validate": dict(function=sql_validate, args=["table_name"]), } - - + SQL_DTYPES = { 'int32': 'INTEGER', 'int64': 'INTEGER', @@ -249,13 +337,144 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, "TEXT": str, } +def sql_group_update(db_file: str, + dataframe: pd.DataFrame, + table_name: str, + columns: List[str], + unique_columns: List[str], + id_columns: Optional[List[str]] = None): + + # Check for unique values contained within the table + unique_values = SQL(db_file, "inspect", table_name=table_name, columns=unique_columns) + + # Get the unique values in the table + table_values = {col: dataframe[col].unique().tolist() for col in unique_columns} + + # Find mismatched indices + new_indices = {col: list(set(table_values[col]) - set(unique_values[col])) + for col in unique_columns} + + # Filter the DataFrame to include only rows with these missing values + # ---- Create DataFrame copy + filtered_df = dataframe.copy() + # ---- Iterate through the extracted dictionary + for col, missing_vals in new_indices.items(): + if missing_vals: + filtered_df = filtered_df[filtered_df[col].isin(missing_vals)] + else: + # ---- Drop the values that are not contained within the list + filtered_df = pd.DataFrame(columns=filtered_df.columns) + + # Insert into the table if not otherwise present + if not filtered_df.empty: + SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) + + # Update the table + # ---- Format the conditional string + case_statements = [] + for col in columns: + case_stmt = "CASE" + for _, row in dataframe.iterrows(): + # Construct the filter condition based on unique_columns + filter_conditions = ' AND '.join([ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ]) + # Add the WHEN condition to the CASE statement + case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" + case_stmt += " END" + case_statements.append(f"{col} = {case_stmt}") + + # Construct the full SQL UPDATE statement + update_clause = ', '.join(case_statements) + + # Format the SQL COMMAND string + sql_command = f""" + UPDATE {table_name} + SET {update_clause} + WHERE ({' OR '.join([ + 
' AND '.join([ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ]) + for _, row in dataframe.iterrows() + ])}); + """ + + # Create engine + engine = create_engine(f"sqlite:///{db_file}") + + # Execute and commit + with engine.connect() as connection: + connection.execute(text(sql_command)) + connection.commit() + + # Dispose engine + engine.dispose() + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + "latitude", "stratum"]) + ) + + # Return a list of the output + return list(key_columns) + +def parse_condition(condition: str): + # Replace logical operators with SQL equivalents + condition = condition.replace('&', ' AND ').replace('|', ' OR ') + + # Handle "IN" lists and replace square brackets with parentheses + condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) + + # Handle range conditions for BETWEEN, including floats + condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + + # Handle individual comparisons + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + + # Return the parsed condition + return condition + +def format_sql_select(table_name, column_names, condition_string): + # Base SQL command to select columns from the table + sql_command = f"SELECT {column_names} FROM {table_name}" + + # Parse the condition string + parsed_condition = parse_condition(condition_string) + + # Add the WHERE clause if a parsed condition is provided + if parsed_condition: + sql_command += " WHERE " + parsed_condition + + # Add a semicolon at the end of the SQL command + sql_command += ";" + + return sql_command + def format_sql_columns(kwargs: dict): # Columns - if "columns" in kwargs: + if "columns" in kwargs and "condition" not in kwargs: if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) - else: + elif "columns" not in kwargs: kwargs["columns"] = "*" # ID/Conflict columns @@ -267,14 +486,14 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation -def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: +def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path]) -> dict: # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" + db_directory = root_directory / "database" # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file @@ -301,7 +520,7 @@ 
def query_processed_files(root_directory: str, file_settings: dict, files: List[ # Query already existing files previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) # Filter out previously processed files # ---- Apply filter by comparing sets and return the output @@ -321,11 +540,6 @@ def sql_data_exchange(database_file: Path, **kwargs): if not table_exists: # ---- Create table SQL(database_file, "create", **kwargs) - # ---- Ignore the `id_columns` argument, if present - try: - del kwargs["id_columns"] - except KeyError: - pass # ---- Insert into table SQL(database_file, "insert", **kwargs) # ---- Return the initial dataframe @@ -345,7 +559,8 @@ def SQL(db_file: str, command: str, **kwargs): engine = create_engine(f"sqlite:///{db_file}") # Format the data columns, if necessary, to fit within the SQL commands - kwargs = format_sql_columns(kwargs) + if command not in ["inspect", "update", "select"]: + kwargs = format_sql_columns(kwargs) # Run the command try: diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index bb5a6423..eae68ec2 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -306,8 +306,13 @@ def group_merge(dataframe, dataframes_to_add, inner_on, outer_on, how="outer", d def group_interpolator_creator( - grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, contrast: str + grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, + contrast: Union[List[str], str] ) -> dict: + + # Check if `contrast` is a list or not + if not isinstance(contrast, list): + contrast = [] # Interpolator generation helper function def interpolator_factory(sub_group): @@ -323,7 +328,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( - grouped_data.groupby([contrast]).apply( + grouped_data.groupby(contrast).apply( lambda group: interpolator_factory(group), include_groups=False ) ).to_dict() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index ba7c2a2c..b657b07d 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt -from typing import Union, Tuple +from typing import Union, Tuple, Optional from pathlib import Path import copy import yaml @@ -13,38 +13,132 @@ import re import contextlib from sqlalchemy import create_engine, text, Engine, inspect -from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP +from echopop.live.live_data_loading import validate_data_directory +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns from echopop.live import live_data_processing as eldp - +from echopop.live import live_data_loading as eldl +from echopop.live.live_survey import LiveSurvey +from echopop.live.live_acoustics import preprocess_acoustic_data +from echopop.live.live_biology import preprocess_biology_data +from echopop.survey import Survey + +survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", 
"C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") +survey_2019.transect_analysis() +survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] +analysis_dict = survey_2019.analysis["transect"] + +proportions_dict=analysis_dict["biology"]["proportions"]["number"] +length_weight_dict = analysis_dict["biology"]["weight"] +stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths +self = LiveSurvey live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` -file_configuration = live_configuration(live_init_config_path, live_file_config_path) -file_configuration.update({"database": {"acoustics": None, "biology": None}}) -#################################################################################################### -# * Accessory function for tuning the acoustic transmit frequency units/scaling -def format_vlaue(x): - pass - -def format_value(x): - if isinstance(x, str): - return "'{}'".format(x.replace("'", "''")) - elif isinstance(x, pd.Timestamp): - return "'{}'".format(x) - elif x is None: - return 'NULL' - else: - return str(x) +file_configuration = self.config +files = biology_files + +biology_output = initial_biology_output +file_configuration = self.config +table_name = "length_df" +df = filtered_biology_output[table_name] +database_file = biology_db +kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) + +def process_biology_data(self): + + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + # ---- Processed + biology_processed = self.input["biology_processed"] + + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], + self.config) + + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) -data_str = ", ".join( - "({})".format(", ".join(format_value(x) for x in row)) - for row in data_tuple -) + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) + + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], + self.config) + ) + + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, length_binned, + self.config) + ) + + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + fitted_weight_df = 
compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) +catch_data = self.input["biology"]["catch_df"] +# Get the spatial column name, if there is one +contrast_columns = file_configuration["spatial_column"].copy() +# ---- Append additional columns that will be used +contrast_columns.extend(["sex", "species_id"]) + +# Calculate grouped totals +# ---- Specimen +specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + + +# Calculate the sexed and total stratum weights for each sex among unaged fish +# ---- Sum the net haul weights from station 1/unaged fish +catch_weights = catch_data.count_variable( + contrasts=["species_id"] + file_configuration["spatial_column"], + variable="haul_weight", fun="sum" +) +# ---- Rename resulting columns for both +catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# Sum the sexed and total weights from the weight-fitted unaged data +# ---- Extract the unaged/length quantized weights +unaged_weights_binned = distributions_dict["unaged_length_weight_tbl"].copy() +# ---- Calculate the total weight per stratum per sex +unaged_weights_sex = unaged_weights_binned.sum() +# ---- Length (by sex) +length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum()#.to_frame("weight") +# ---- Further reduce +length_weight_total = length_weights_sex.transpose().unstack(0).sum(axis=0) +# ---- Standardize the unaged sexed weights +(length_weights_sex / length_weight_total).unstack(0) * catch_weights["total_weight"].to_numpy() + + +length_weight_total = ( + length_weights_sex.reset_index(list(set(contrast_columns)-set(file_configuration["spatial_column"].copy()))) + ["weight_interp"].sum() +) +# ---- Calculate the stratum totals +unaged_strata_weights = unaged_weights_sex.unstack(0).sum(axis=0) +# ---- Standardize the unaged sexed weights +unaged_weights_sex_standardized = (unaged_weights_sex / unaged_strata_weights).unstack( + 0 +) * catch_strata_weights["stratum_weight"].to_numpy() #################################################################################################### # * Functionality for reading in processed acoustic data @@ -182,6 +276,37 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # SQL(database_file, "drop", table_name="nasc_df") # SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] + +def process_acoustic_data(self, + echometrics: bool = True): + + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + ) + + # ---- Amend the dtypes if echometrics were computed + if echometrics: + # ---- Set dtypes + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation_index": float, "occupied_area": float}) + ) + # ---- Reorder columns + nasc_data_df = nasc_data_df[[ + "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", + "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area" + ]] + + def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: 
dict, echometrics: bool = True): @@ -389,6 +514,10 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): acoustic_data = self.input["acoustics"] biology_data = self.input["biology"] + + +from echopop.live.live_core import SPATIAL_CONFIG_MAP + def load_spatial_data(acoustic_data: dict, biology_data: dict, file_configuration: dict,): @@ -406,9 +535,15 @@ def load_spatial_data(acoustic_data: dict, # Validate the configuration validate_spatial_config(spatial_config) + # Create spatial dictionary that will be added as an `input` + spatial_dict = {"link_method": acoustics_biology_link} + # Assign the spatial link constraints to the acoustic and biological data if acoustics_biology_link == "INPFC": - apply_inpfc_definitions(acoustic_data, biology_data, spatial_config) + spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + + # Return the dictionary as an output + return spatial_dict @@ -552,6 +687,8 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di __all__ = ["operations"] +biology_data = self.input["biology"] + # Meld bio datasets length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], contrasts=["haul_num", "species_id", "length"]) @@ -576,7 +713,10 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di # file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] -def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Optional[float] = None, TS_L_intercept: Optional[float] = None, weighted: Optional[Union[float, int, str]] = None): +def average_sigma_bs(length: Union[pd.DataFrame, float, int], + TS_L_slope: Optional[float] = None, + TS_L_intercept: Optional[float] = None, + weighted: Optional[Union[float, int, str]] = None): # if isinstance(length, pd.DataFrame): @@ -648,6 +788,36 @@ def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Option else: return sigma_bs_value.mean() +def parse_condition(condition): + # Handle nested conditions and logical operators + condition = condition.replace('&', ' AND ').replace('|', ' OR ') + + # Handle "IN" lists and replace square brackets with parentheses + condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) + + # Handle range conditions for BETWEEN, including floats + condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + + # Handle individual comparisons + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + + # Handle single equal sign + condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", condition) + + # Remove redundant spaces + condition = re.sub(r'\s+', ' ', condition).strip() + + return condition + +columns = ["sigma_bs_sum", "sigma_bs_count"] +operation = "+" +table_name = "sigma_bs_mean_df" +dataframe = sigma_bs_df +condition = condition_str + +SQL(acoustic_db, "select", table_name="files_read") average_sigma_bs ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) @@ -671,13 +841,57 @@ def integrate_nasc(prc_nasc_df: pd.DataFrame): }) +current = 10 ** (-60/10) +count = 5 +old_tuple = (current, count) + +new = 10 ** (-50/10) +count = 2 
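+# NOTE (editorial sketch, not part of the original patch): the scratch lines around here
+# test how the stratum-level mean of linear-domain sigma_bs can be updated as new hauls
+# arrive, using the `sigma_bs_sum`/`sigma_bs_count` accumulator columns referenced above.
+# A minimal, hypothetical helper for that bookkeeping might look like:
+def update_running_sigma_bs(old_sum: float, old_count: int, new_ts_db: np.ndarray):
+    # Convert incoming TS values (dB) to linear sigma_bs and accumulate the sum/count
+    new_linear = 10 ** (new_ts_db / 10.0)
+    total_sum = old_sum + new_linear.sum()
+    total_count = old_count + new_linear.size
+    # The running mean is recovered from the accumulated sum and count, which is
+    # equivalent to the count-weighted average of the old and new batch means
+    return total_sum, total_count, total_sum / total_count
+
+# Example: pooling the two batches above reproduces the mean of the combined samples, e.g.
+# update_running_sigma_bs(old_sum=current * 5, old_count=5, new_ts_db=np.array([-50.0, -50.0]))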
+ +data = pd.DataFrame({"value": 10 ** (np.array([-60.0, -50.0]) / 10.0), + "count": np.array([5, 2]) }) + +data = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61]) / 10.0)}) +data_new = pd.DataFrame({"value": 10 ** (np.array([-51, -52, -53, -52, -54, -56, -58]) / 10.0)}) +data["value"].sum() / data["value"].size +data_new["value"].sum() / data_new["value"].size + +(data["value"].sum() + data_new["value"].sum()) / (data["value"].size + data_new["value"].size) + +data_test = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61, -51, -52, -53, -52, -54, -56, -58]) / 10.0)}) +data_test["value"].mean() + +data["value"].mean() + +data["value"].sum() + +old_number = np.average(data["value"], weights=data["count"]) +old_count = data["count"].sum() + +new_number = np.array([-80.0, -70.0, -60.0, -70.0, -80.0]) +new_count = len(new_number) +new_mean = 10 ** (new_number.mean() / 10) + +np.average(np.concatenate([[old_number], [new_mean]]), + weights = np.concatenate([[old_count], [new_count]])) + +np.mean(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) +np.average(data["value"], weights=data["count"]) + +np.sum(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) pd.read_fr pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) +db_file = self.config["database"]["acoustics"] engine = create_engine(f"sqlite:///{db_file}") connection = engine.connect() -kwargs["dataframe"].to_sql(name=kwargs["table_name"], + +SQL(db_file, "select", table_name="sigma_bs_mean_df", condition="stratum = 1") + + +kwargs["dataframe"].to_sql(name=kwa +rgs["table_name"], con=connection, if_exists="append", index=False) connection.close() From a4a51a6d03820da9560985d6df0702f826819a3c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 09:37:40 -0700 Subject: [PATCH 10/81] Format some changes to methods --- echopop/live/live_acoustics.py | 65 +++++- echopop/live/live_biology.py | 2 + echopop/live/live_core.py | 4 +- echopop/live/live_data_loading.py | 4 +- echopop/live/live_spatial_methods.py | 38 +++- echopop/live/live_survey.py | 38 +++- echopop/live/sql_methods.py | 36 ++- echopop/test_workflow.py | 33 +++ echopop/zarr_read_ingest_test.py | 329 --------------------------- 9 files changed, 181 insertions(+), 368 deletions(-) create mode 100644 echopop/test_workflow.py diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 21ba1e23..6afc5bc2 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -3,6 +3,8 @@ import pandas as pd from echopop.acoustics import ts_length_regression, to_linear, to_dB +from .live_spatial_methods import apply_spatial_definitions +from .sql_methods import sql_data_exchange # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -25,6 +27,7 @@ def configure_transmit_frequency(frequency_values: pd.Series, # TODO: Documentation def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + spatial_dict: dict, file_configuration: dict) -> pd.DataFrame: # Get acoustic processing settings @@ -34,8 +37,8 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + prc_nasc_df.loc[:, "frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df.loc[:, "frequency_nominal"], 
transmit_settings, acoustic_analysis_settings["dataset_units"]["frequency"]) ) @@ -43,6 +46,11 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, prc_nasc_df_filtered = ( prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] ) + + # Apply spatial settings + prc_nasc_df_filtered.loc[:, "stratum"] = ( + apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) + ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object # ---- Replace NASC `NaN` values with `0.0` @@ -176,13 +184,19 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # Convert `nasc_dict` to a DataFrame and return the output return pd.Series(nasc_dict) -def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): +def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + # Get spatial definitions, if any + spatial_column = file_configuration["spatial_column"] + # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(integrate_nasc, echometrics, include_groups=False) + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) + .apply(integrate_nasc, echometrics) .unstack().reset_index() + .sort_values("ping_time") ) # ---- Amend the dtypes if echometrics were computed if echometrics: @@ -194,8 +208,39 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): "aggregation_index": float, "occupied_area": float}) ) # ---- Reorder columns - nasc_data_df = nasc_data_df[[ - "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", - "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", - "occupied_area" - ]] + nasc_data_df = nasc_data_df[ + spatial_column + + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", + "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area"] + ] + + # Return the output + return nasc_data_df + +def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict): + + # Get acoustic database filename + acoustic_db = file_configuration["database"]["acoustics"] + + # Create a copy of the dataframe + df = nasc_data_df.copy() + + # Add population-specific columns (specified in the file configuration) + # TODO: Add to `yaml` file for configuration; hard-code for now + add_columns = ["number_density", "biomass_density", "abundance", "biomass"] + # ---- + df[add_columns] = 0.0 + # ---- Assign values for key values + key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = key_values + + # Insert the new data into the database & pull in the combined dataset + # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT + _ = sql_data_exchange(acoustic_db, dataframe=df, table_name="survey_data_df", + id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) + + # Return the formatted dataframe + return df + diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index cf04589b..76e24e6d 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,8 @@ import pandas as pd import numpy as np from .sql_methods import SQL, sql_data_exchange, 
get_table_key_names +from .live_spatial_methods import apply_spatial_definitions +from .live_acoustics import average_sigma_bs from echopop.acoustics import ts_length_regression, to_dB, to_linear from echopop.utils.operations import group_interpolator_creator from functools import reduce diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 677cddc3..256b9f27 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -53,13 +53,13 @@ "dtypes": { "partition": str, "species_code": int, - "sample_weight_kg": float, + "overall_weight": float, "catch_perc": float, }, "names": { "partition": "trawl_partition", "species_code": "species_id", - "sample_weight_kg": "haul_weight", + "overall_weight": "haul_weight", "catch_perc": "catch_percentage", } }, diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 1220591f..823ebac4 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -213,11 +213,11 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate that files even exist # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_settings["extension"]}")) + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) if not data_files: raise FileNotFoundError( - f"No `*.{file_settings["extension"]}` files found in [{directory_path}]!" + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" ) # Check and format specific input filenames diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index c83f35de..2dd8cefc 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -4,6 +4,7 @@ from geopy.distance import distance from ..spatial.projection import utm_string_generator import shapely.geometry +from typing import Union def create_inpfc_strata(spatial_config: dict): @@ -34,28 +35,43 @@ def create_inpfc_strata(spatial_config: dict): return inpfc_strata_df def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): + + # Create dataset copy + dataset = dataset.copy() # Bin the data based on latitude - if "latitude" in dataset.columns: - dataset["stratum"] = pd.cut( - dataset["latitude"], - np.unique(np.hstack([inpfc_df["lower"], inpfc_df["upper"]])), - labels = inpfc_df["stratum"] + if isinstance(dataset, pd.DataFrame) and "latitude" in dataset.columns: + dataset.loc[:, "stratum"] = pd.cut( + dataset.loc[:, "latitude"], + np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), + labels = inpfc_df.loc[:, "stratum"] ).astype(int) + + return dataset + else: + strata = pd.cut(dataset.copy(), + np.unique(np.hstack([inpfc_df.loc[:, "lower"], + inpfc_df.loc[:, "upper"]])), + labels = inpfc_df.loc[:, "stratum"] + ) + + return strata # Return the INPFC-stratified dataset - return dataset + # return dataset -def apply_spatial_definitions(data_dict: dict, spatial_dict: dict): +def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dict): # Get the acoustic-biology link method link_method = spatial_dict["link_method"] - + # Apply spatial definitions - if link_method == "INPFC": - data_dict.update({ - k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in data_dict.items() + if isinstance(dataset, dict) and link_method == "INPFC": + dataset.update({ + k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items() }) + elif 
isinstance(dataset, pd.Series) and link_method == "INPFC": + return apply_inpfc_definitions(dataset, spatial_dict["strata"]) # def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 306ddeb9..adb67cc3 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -13,7 +13,7 @@ ) from .sql_methods import query_processed_files -from .live_acoustics import preprocess_acoustic_data, integrate_nasc +from .live_acoustics import preprocess_acoustic_data, compute_nasc from .live_biology import preprocess_biology_data @@ -59,7 +59,6 @@ def __init__( if verbose: pass - def load_acoustic_data(self, input_filenames: Optional[list] = None, verbose: bool = True): @@ -76,13 +75,17 @@ def load_acoustic_data(self, # ---- Add the `acoustic_data_units` to the dictionary self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset - self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df, + # TODO: SettingWithCopyWarning: + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), + self.input["spatial"], self.config) # TODO: Add verbosity for printing database filepaths/connections if verbose: + # ---- Create file list + file_list = "\n".join(acoustic_files) print( f"The following acoustic files have been processed:\n" - f"{"\n".join(acoustic_files)}." + f"{file_list}." ) else: self.input["acoustics"]["prc_nasc_df"] = None @@ -97,9 +100,11 @@ def load_biology_data(self, # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: - print( + # ---- Create file list + file_list = "\n".join(biology_files) + print( f"The following biological files have been processed:\n" - f"{"\n".join(biology_files)}." + f"{file_list}." 
) # Read in the biology data files @@ -111,10 +116,21 @@ def load_biology_data(self, ) def process_biology_data(self): + # method here + pass + + def process_acoustic_data(self, echometrics: bool = True): + + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - # Separate out processed and unprocessed biological data - # ----- Unprocessed - biology_unprocessed = self.input["biology"] - # ---- Processed - biology_processed = self.input["biology_processed"] + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + + # Format the dataframe and insert into the LiveSurvey object + self.input["nasc_df"] = nasc_data_df + + def estimate_population(self): + # method here + pass diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 4d253455..0bb47306 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -45,7 +45,12 @@ def sql_map_tables(connection: sqla.Connection): """ """ inspector = inspect(connection) - return inspector.get_table_names() + table_names = inspector.get_table_names() + # result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) + # table_names = result.fetch_all() + # Extract table names from the results + # table_names = [name[0] for name in table_names] + return table_names def sql_validate(connection: sqla.Connection, table_name: str): """ @@ -83,7 +88,7 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] else: # Inspect unique values in specified columns # ---- Create SQL command - sql_command = f"SELECT DISTINCT {", ".join(columns)} FROM {table_name};" + sql_command = f"SELECT DISTINCT {', '.join(columns)} FROM {table_name};" # ---- Execute table = connection.execute(text(sql_command.strip())) # ---- Extract unique values @@ -96,7 +101,7 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] def sql_drop(connection: sqla.Connection, table_name: str): """ """ - connection.execute(text(f"DROP TABLE IF EXISTS {table_name};")) + connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, id_columns: Optional[list] = None): @@ -551,6 +556,31 @@ def sql_data_exchange(database_file: Path, **kwargs): # Select existing data frame the database and return the output return SQL(database_file, "select", **kwargs) +def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + + # Get all database files + database_files = file_configuration["database"] + + # Iterate through all keys + for _, db_file in database_files.items(): + # ---- Map the table names + table_names = SQL(db_file, "map") + # ---- Drop any noted exceptions + if not isinstance(table_exception, list): + table_exception = [table_exception] + # ---- Drop exception table name + if None not in table_exception: + table_names = list(set(table_names) - set(table_exception)) + # ---- Iterate through + for table_name in table_names: + SQL(db_file, "drop", table_name=table_name) + # ---- Validate that all tables were removed + remaining_tables = SQL(table_names, "map") + if set(table_names).intersection(set(remaining_tables)): + raise ValueError( + f"Attempted reset of [{str(db_file)}] failed." 
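# NOTE: Hedged sketch of the table-reset behaviour implemented by `reset_db_files` above,
# written against the sqlite3 standard library rather than the package's `SQL(...)` helper:
# list the tables in a database file, drop everything outside an exception list, and confirm
# that only the exempted tables remain.
import sqlite3

def reset_sqlite_tables(db_path, keep=None):
    keep = keep or []
    con = sqlite3.connect(db_path)
    try:
        tables = [row[0] for row in
                  con.execute("SELECT name FROM sqlite_master WHERE type='table';")]
        for table in set(tables) - set(keep):
            con.execute(f'DROP TABLE IF EXISTS "{table}";')
        con.commit()
        remaining = [row[0] for row in
                     con.execute("SELECT name FROM sqlite_master WHERE type='table';")]
        if set(remaining) - set(keep):
            raise ValueError(f"Attempted reset of [{db_path}] failed.")
    finally:
        con.close()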
+ ) + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py new file mode 100644 index 00000000..bb8c2bb0 --- /dev/null +++ b/echopop/test_workflow.py @@ -0,0 +1,33 @@ +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import reset_db_files + +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" + +realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) + +#################################################################################################### +# TEST: ACOUSTICS +#################################################################################################### +# NOTE: Reset database file for utility purposes +reset_db_files(realtime_survey.config) + +# NOTE: LOAD DATA +realtime_survey.load_acoustic_data() +# NOTE: INITIAL PROCESSING [JUST ACOUSTIC] +realtime_survey.process_acoustic_data() +realtime_survey.input +#################################################################################################### +# TEST: BIOLOGY +#################################################################################################### +# NOTE: Reset database file for utility purposes +reset_db_files(realtime_survey.config) + +# NOTE: LOAD DATA +realtime_survey.load_biology_data() +realtime_survey.input +#################################################################################################### +# TEST: POPULATION ESTIMATES +#################################################################################################### +# NOTE: Acoustic / biological data converge here to derive population estimates +realtime_survey.estimate_population() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index b657b07d..2e7ef567 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -149,195 +149,6 @@ def process_biology_data(self): root_directory = file_configuration["data_root_dir"] -#################################################################################################### -# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data = load_acoustic_data(file_configuration) -acoustic_data -file_configuration["database"] - -def estimate_echometrics(acoustic_data_df: pd.DataFrame): - - # Create copy - acoustic_df = acoustic_data_df.copy().reset_index(drop=True) - - # Pre-compute the change in depth - acoustic_df["dz"] = acoustic_df["depth"].diff() - - # Initialize echometrics dictionary - echometrics = {} - - # Compute the metrics center-of-mass - if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation": np.nan, - "occupied_area": 0.0, - }) - else: - - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / 
acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. ) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass - echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the dispersion - echometrics.update({ - "dispersion": ( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) - - # Return the dictionary - return echometrics - -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): - - # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} - - # Horizontally concatenate `echometrics`, if `True` - if echometrics: - # ---- Compute values - # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) - # ---- Merge - nasc_dict.update(echometrics_dict) - - # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) - - -acoustic_data_df = acoustic_data["prc_nasc_df"] - - - -# SQL(database_file, "drop", table_name="nasc_df") -# SQL(database_file, "validate", **kwargs) -# SQL(database_file, "create", table_name="nasc_df", primary_keys=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "validate", **kwargs) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", dataframe=nasc_data_df) -# SQL(database_file, "drop", table_name="nasc_df") -# SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] - - -def process_acoustic_data(self, - echometrics: bool = True): - - # Get the unprocessed acoustic data - acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(integrate_nasc, echometrics, include_groups=False) - .unstack().reset_index() - ) - - # ---- Amend the dtypes if echometrics were computed - if echometrics: - # ---- Set dtypes - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation_index": float, "occupied_area": float}) - ) - # ---- Reorder columns - nasc_data_df = nasc_data_df[[ - "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", - "max_Sv", "aggregation_index", "center_of_mass", 
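# NOTE: Compact sketch of the layer metrics computed in `estimate_echometrics` above
# (center of mass, dispersion, evenness, and aggregation index) for a single vertical
# profile of depth and NASC. Values are toy numbers.
import numpy as np

depth = np.array([10.0, 20.0, 30.0, 40.0])
nasc = np.array([0.0, 150.0, 300.0, 50.0])
nasc_total = nasc.sum()
center_of_mass = (depth * nasc).sum() / nasc_total
dispersion = (((depth - center_of_mass) ** 2) * nasc).sum() / nasc_total
evenness = (nasc ** 2).sum() / nasc_total ** 2
aggregation_index = 1.0 / evenness
print(center_of_mass, dispersion, evenness, aggregation_index)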
"dispersion", "evenness", - "occupied_area" - ]] - - -def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) - .reset_index() - ) - # ---- Amend the dtypes if echometrics were computed - if echometrics: - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation": float, "occupied_area": float}) - ) - - # Get the acoustics database file - acoustics_db = file_configuration["database"]["acoustics"] - - # Insert the new data into the database and pull in the combined previous and new data combined - full_nasc_df = sql_data_exchange(acoustics_db, dataframe=nasc_data_df, - table_name="nasc_df", - id_columns=["longitude", "latitude", "ping_time"], - primary_keys=["longitude", "latitude", "ping_time"], - output_type=pd.DataFrame) - - # Return the output - return full_nasc_df - #################################################################################################### def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): @@ -811,146 +622,6 @@ def parse_condition(condition): return condition -columns = ["sigma_bs_sum", "sigma_bs_count"] -operation = "+" -table_name = "sigma_bs_mean_df" -dataframe = sigma_bs_df -condition = condition_str - -SQL(acoustic_db, "select", table_name="files_read") -average_sigma_bs - -ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) -def integrate_nasc(prc_nasc_df: pd.DataFrame): - -# Compute the number of layers -echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size -}) - -# Compute the index of aggregation -echometrics.update({ - "aggregation": 1 / echometrics["evenness"] -}) - -# Get the occupied area -echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) -}) - - -current = 10 ** (-60/10) -count = 5 -old_tuple = (current, count) - -new = 10 ** (-50/10) -count = 2 - -data = pd.DataFrame({"value": 10 ** (np.array([-60.0, -50.0]) / 10.0), - "count": np.array([5, 2]) }) - -data = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61]) / 10.0)}) -data_new = pd.DataFrame({"value": 10 ** (np.array([-51, -52, -53, -52, -54, -56, -58]) / 10.0)}) -data["value"].sum() / data["value"].size -data_new["value"].sum() / data_new["value"].size - -(data["value"].sum() + data_new["value"].sum()) / (data["value"].size + data_new["value"].size) - -data_test = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61, -51, -52, -53, -52, -54, -56, -58]) / 10.0)}) -data_test["value"].mean() - -data["value"].mean() - -data["value"].sum() - -old_number = np.average(data["value"], weights=data["count"]) -old_count = data["count"].sum() - -new_number = np.array([-80.0, -70.0, -60.0, -70.0, -80.0]) -new_count = len(new_number) -new_mean = 10 ** (new_number.mean() / 10) - -np.average(np.concatenate([[old_number], [new_mean]]), - weights = np.concatenate([[old_count], [new_count]])) - -np.mean(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) -np.average(data["value"], weights=data["count"]) - -np.sum(10 ** 
(np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) - - -pd.read_fr -pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) -db_file = self.config["database"]["acoustics"] -engine = create_engine(f"sqlite:///{db_file}") -connection = engine.connect() - -SQL(db_file, "select", table_name="sigma_bs_mean_df", condition="stratum = 1") - - -kwargs["dataframe"].to_sql(name=kwa -rgs["table_name"], - con=connection, - if_exists="append", index=False) -connection.close() -engine.dispose() -SQL(db_file, "insert", table_name=table_name, columns="*", - filter_columns=insertion_filter, - dataframe=df) - -SQL(db_file, "select", table_name="files_read") -SQL(db_file, "select", table_name="catch_df") -SQL(db_file, "select", table_name="specimen_df") -SQL(db_file, "select", table_name="length_df") - -def check_table_schema(connection, **kwargs): - query = text(("PRAGMA table_info({table_name});").format(**kwargs)) - schema = connection.execute(query).fetchall() - print("Table Schema:", schema) - -check_table_schema(connection, table_name=table_name) - -def insert_test_data(connection, table_name): - test_data = pd.DataFrame({ - 'trawl_partition': ['test'], - 'species_id': ['test'], - 'haul_weight': [0.0], - 'catch_percentage': [0.0], - 'haul_num': [1] - }) - - test_data.to_sql(name=table_name, con=connection, if_exists='append', index=False) - print("Test data inserted.") - -insert_test_data(connection, table_name) - -kwargs = {} -command = "insert" -kwargs["table_name"] = "catch_df" -kwargs["dataframe"] = df -kwargs["filter_columns"] = insertion_filter -columns = "*" - - -re.compile(file_name_format) -pattern = file_name_format -pattern = pattern.replace('{DATE:YYYYMM}', r'(?P\d{6})') -pattern = pattern.replace('{HAUL}', r'(?P\d+)') -pattern = pattern.replace('{FILE_ID}', r'(?P.+)') -regex = re.compile(pattern) -haul_values = [] - -file_name_format.search(file.name) -sub_df_lst = [] -for file in subcsv_files: - match = regex.search(file.name) - if match: - haul_value = match.group('HAUL') - df = pd.read_csv(file, usecols=list(sub_config_map.keys())) - df['HAUL'] = haul_value # Append HAUL value as a new column - sub_df_lst.append(df) #################################################################################################### def load_spatial_data(file_configuration: dict, acoustic_data: pd.DataFrame, From e395405efba52088ec6734994a8aedee647a42b1 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 10:34:50 -0700 Subject: [PATCH 11/81] Quick patch --- echopop/test_workflow.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index bb8c2bb0..9678736b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -26,8 +26,12 @@ # NOTE: LOAD DATA realtime_survey.load_biology_data() realtime_survey.input +# NOTE: INITIAL PROCESSING [JUST BIOLOGY] +realtime_survey.process_biology_data() +realtime_survey.input #################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates +# TODO: Add argument that indicates what the new datasets and what data need to be pulled in realtime_survey.estimate_population() \ No newline at end of file From 2f04cab604d6100c8afaea2003d392a9da6c1372 Mon Sep 17 00:00:00 2001 From: Brandyn 
Lucca Date: Thu, 1 Aug 2024 12:26:56 -0700 Subject: [PATCH 12/81] Fleshed out biology processing methods --- echopop/__init__.py | 2 +- echopop/live/__init__.py | 2 +- echopop/live/live_acoustics.py | 2 +- echopop/live/live_biology.py | 22 ++-- echopop/live/live_survey.py | 94 +++++++++++-- echopop/test_workflow.py | 14 +- echopop/zarr_read_ingest_test.py | 219 +++++++++++++++++++++++++++---- 7 files changed, 296 insertions(+), 59 deletions(-) diff --git a/echopop/__init__.py b/echopop/__init__.py index a28b91b0..7dfc17fd 100644 --- a/echopop/__init__.py +++ b/echopop/__init__.py @@ -3,4 +3,4 @@ __all__ = ["Survey", "operations"] -from _echopop_version import version as __version__ # noqa +# from _echopop_version import version as __version__ # noqa diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index f4e742bb..325afcbb 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -2,4 +2,4 @@ __all__ = ["operations"] -from _echopop_version import version as __version__ # noqa \ No newline at end of file +# from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 6afc5bc2..11d0b392 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from echopop.acoustics import ts_length_regression, to_linear, to_dB +from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions from .sql_methods import sql_data_exchange diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 76e24e6d..efc88765 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -3,8 +3,8 @@ from .sql_methods import SQL, sql_data_exchange, get_table_key_names from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator +from ..acoustics import ts_length_regression, to_dB, to_linear +from ..utils.operations import group_interpolator_creator from functools import reduce def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): @@ -129,16 +129,12 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, file_configuration: dict): - # Assign contrast columns - contrast_list = [] - # ---- Check for "stratum" column - if "stratum" in specimen_data.columns and "stratum" in length_data.columns: - contrast_list.append(["stratum"]) - # ---- Add the additional columns - contrast_list.append(["haul_num", "species_id", "length"]) - # ---- Concatenate - contrast_columns = list(np.concatenate(contrast_list)) - + # Determine contrast columns + # ----- Check for "stratum" column in spatial definitions configuration + stratum_column = file_configuration["spatial_column"] + # ---- Append to other defined keys + contrast_columns = stratum_column + ["haul_num", "species_id", "length"] + # Meld the biological datasets length_datasets = specimen_data.meld(length_data, contrasts=contrast_columns) @@ -167,7 +163,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) - .apply(lambda x: average_sigma_bs(x, 
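# NOTE: Sketch of the running sigma_bs update explored in the scratch code above: TS values
# are converted to the linear domain and an existing mean is merged with a new batch using
# count-weighted averaging, which is equivalent to pooling all samples. Toy TS values only.
import numpy as np

def to_linear(ts_db):
    return 10.0 ** (ts_db / 10.0)

previous_mean, previous_count = to_linear(np.array([-61.0, -62.0, -63.0])).mean(), 3
new_samples = to_linear(np.array([-51.0, -52.0, -54.0, -56.0]))
updated_mean = np.average(
    [previous_mean, new_samples.mean()],
    weights=[previous_count, new_samples.size],
)
pooled = to_linear(np.array([-61.0, -62.0, -63.0, -51.0, -52.0, -54.0, -56.0])).mean()
assert np.isclose(updated_mean, pooled)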
weighted="length_count"), include_groups=False) + .apply(lambda x: average_sigma_bs(x, weights="length_count"), include_groups=False) .reset_index(name="sigma_bs") ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index adb67cc3..e8589b93 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -13,8 +13,21 @@ ) from .sql_methods import query_processed_files -from .live_acoustics import preprocess_acoustic_data, compute_nasc -from .live_biology import preprocess_biology_data +from .live_acoustics import ( + compute_nasc, + preprocess_acoustic_data +) + +from .live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_weight_regression, + number_proportions, + length_bin_weights, + preprocess_biology_data +) from . import live_data_processing as eldp @@ -116,19 +129,74 @@ def load_biology_data(self, ) def process_biology_data(self): - # method here - pass - - def process_acoustic_data(self, echometrics: bool = True): - - # Get the unprocessed acoustic data - acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + # TODO: How and when should the already processed data be imported? + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], + biology_unprocessed["length_df"], + self.config) + + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) + + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) - # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = nasc_data_df + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + self.config) + ) + + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, + length_binned, self.config) + ) + + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + fitted_weight_df = compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): + + # Check for if any data is present; if not, provide report + if self.input["acoustics"]["prc_nasc_df"] is None: + # ---- Set the corresponding `nasc_df` DataFrame to None + self.input["nasc_df"] = None + # ---- Print, if verbose + if verbose: + print( + "No acoustic data located in `*.input['acoustics']['prc_nasc_df']" + " DataFrame. 
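# NOTE: Minimal sketch of the length-weight fit used by `length_weight_regression` above:
# regress log10(weight) on log10(length) with `np.polyfit`, yielding the allometric "rate"
# (slope) and "initial" (intercept) parameters. Lengths/weights below are toy values.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "length": [20.0, 25.0, 30.0, 35.0, 40.0],       # cm
    "weight": [0.055, 0.110, 0.195, 0.320, 0.480],  # kg
})
rate, initial = np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1)
# Back-transform to predict fitted weight-at-length: w = 10**initial * L**rate
weight_fitted = 10.0 ** initial * df["length"] ** rate
print(rate, initial)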
Data processing step will therefore be skipped." + ) + else: + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + + # Format the dataframe and insert into the LiveSurvey object + self.input["nasc_df"] = nasc_data_df def estimate_population(self): # method here diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 9678736b..ee87216b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,8 +1,17 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import reset_db_files +from echopop.live.sql_methods import query_processed_files +from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc +from echopop.live.live_biology import preprocess_biology_data +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, +) -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl + +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) @@ -34,4 +43,5 @@ #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates # TODO: Add argument that indicates what the new datasets and what data need to be pulled in +# TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} realtime_survey.estimate_population() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2e7ef567..1d53749a 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -50,12 +50,6 @@ def process_biology_data(self): - # Separate out processed and unprocessed biological data - # ----- Unprocessed - biology_unprocessed = self.input["biology"] - # ---- Processed - biology_processed = self.input["biology_processed"] - # Compute `sigma_bs` by sending it to the appropriate database table compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], self.config) @@ -98,15 +92,23 @@ def process_biology_data(self): catch_data = self.input["biology"]["catch_df"] # Get the spatial column name, if there is one -contrast_columns = file_configuration["spatial_column"].copy() +spatial_column = file_configuration["spatial_column"] # ---- Append additional columns that will be used -contrast_columns.extend(["sex", "species_id"]) +contrast_columns = spatial_column + ["sex", "species_id"] # Calculate grouped totals +# ---- Sum the net haul weights from station 1/unaged fish +catch_weights = catch_data.count_variable( + contrasts=["species_id"] + spatial_column, + variable="haul_weight", fun="sum" +) +# ---- Rename resulting columns for both +catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + # ---- Specimen specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - +specimen_weight_binned # Calculate 
the sexed and total stratum weights for each sex among unaged fish # ---- Sum the net haul weights from station 1/unaged fish catch_weights = catch_data.count_variable( @@ -116,30 +118,191 @@ def process_biology_data(self): # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) -# Sum the sexed and total weights from the weight-fitted unaged data -# ---- Extract the unaged/length quantized weights -unaged_weights_binned = distributions_dict["unaged_length_weight_tbl"].copy() -# ---- Calculate the total weight per stratum per sex -unaged_weights_sex = unaged_weights_binned.sum() -# ---- Length (by sex) -length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum()#.to_frame("weight") -# ---- Further reduce -length_weight_total = length_weights_sex.transpose().unstack(0).sum(axis=0) +# For the specimen data +# ---- Sum the net haul weights from station 1/unaged fish +# ---- Specimen +specimen_weights_sex = ( + specimen_weight_binned + .groupby(contrast_columns)["weight"] + .sum() +) +# ---- Total (per stratum, if it exists) +specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + +# For the length (unaged) dataset +length_weights_sex = ( + length_weight_binned + .groupby(contrast_columns)["weight_interp"] + .sum() +) +# ---- Further reduce to the grand total (per stratum, if it exists) +length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + # ---- Standardize the unaged sexed weights -(length_weights_sex / length_weight_total).unstack(0) * catch_weights["total_weight"].to_numpy() +length_weight_standardized = ( + (length_weights_sex / length_weight_total).unstack(0) + * catch_weights["total_weight"].to_numpy() +) + +# Calculate the specimen weight proportions +# ---- Pivot weight bins +specimen_weight_binned_pvt = ( + specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed = False + ) +) +# ---- Divide by the aged stratum weights (relative to only aged fish) +specimen_weight_proportions_pvt = ( + specimen_weight_binned_pvt / specimen_weight_total.to_numpy() +) +# ---- Pivot back to the desired format +specimen_weight_proportion = ( + specimen_weight_proportions_pvt + .stack().reset_index(name="weight_proportion") + .pivot_table(columns=stratum_column + ["species_id", "sex"], + index="length_bin", values="weight_proportion") +) +# ---- Calculate the internal (i.e. 
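# NOTE: Simplified sketch of the proportion logic assembled above: sexed weights are summed
# per stratum, divided by the stratum total to give weight proportions, and the unaged
# fraction is taken as the complement of the aged fraction. Numbers are illustrative only.
import pandas as pd

weights = pd.DataFrame({
    "stratum": [1, 1, 2, 2],
    "sex": ["male", "female", "male", "female"],
    "weight": [120.0, 150.0, 80.0, 95.0],
})
weights["weight_proportion"] = (
    weights["weight"] / weights.groupby("stratum")["weight"].transform("sum")
)
aged_proportions = pd.Series({1: 0.6, 2: 0.7})  # e.g. aged (specimen) share per stratum
unaged_proportions = 1.0 - aged_proportions
print(weights, unaged_proportions, sep="\n")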
only aged fish) for each sex +within_specimen_sex_proportions = ( + specimen_weight_proportion.sum() +) +# Calculate the total strata weights +# ---- Index `catch_weights` +catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) +# ---- Compute the spatially-stratified/grouped weights +spatial_weights = ( + pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) + .pivot_table( + columns=stratum_column, + aggfunc="sum", + values="total_weight", + observed=False + ) +) -length_weight_total = ( - length_weights_sex.reset_index(list(set(contrast_columns)-set(file_configuration["spatial_column"].copy()))) - ["weight_interp"].sum() +# Calculate the weight proportions relative to the overall stratum weights +# ---- Aged +# -------- Reformat into dataframe and merge with total stratum weights +specimen_weights_binned_df = ( + specimen_weight_binned_pvt.stack() + .to_frame("specimen_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=stratum_column) +) +# -------- Calculate proportions +specimen_weights_binned_df["weight_proportion_overall"] = ( + specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] +) +# -------- Consolidate to calculate the sexed proportions per stratum +specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column + ["species_id", "sex"])[ + "weight_proportion_overall" +].sum() +# ---- Unaged +# -------- Reformat into dataframe and merge with total stratum weights +length_weights_sex_standardized_df = ( + length_weight_standardized.stack() + .to_frame("catch_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=stratum_column) +) +# -------- Calculate proportions +length_weights_sex_standardized_df["weight_proportion_overall"] = ( + length_weights_sex_standardized_df["catch_weight"] + / length_weights_sex_standardized_df["total_weight"] +) +# -------- Back-calculate the sexed weight proportions relative to just unaged fish +# ------------ Aggregate proportions +length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" +).transpose().unstack(["species_id"]).sum(axis=0) +# ------------ Re-compute the proportions +length_weight_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + index=["species_id", "sex"], columns=stratum_column, + values="weight_proportion_overall" + ) + / length_total_sex_proportions.to_numpy() ) -# ---- Calculate the stratum totals -unaged_strata_weights = unaged_weights_sex.unstack(0).sum(axis=0) -# ---- Standardize the unaged sexed weights -unaged_weights_sex_standardized = (unaged_weights_sex / unaged_strata_weights).unstack( - 0 -) * catch_strata_weights["stratum_weight"].to_numpy() +# Compute the overall length-binned weight distributions among unaged fish +# ---- Extract the number proportions computed for unaged fish +length_number_proportions = length_number_proportion.copy() +# ---- Filter out values besides those computed for 'all' fish +length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] +# ---- Convert to a table +length_number_proportions_tbl = length_number_proportions.pivot_table( + columns=stratum_column + ["species_id"], + index=["length_bin"], + values="proportion_number_length", + aggfunc="sum", + observed=False, +) +# ---- Extract the fitted weight values calculated for all fish +length_weight_all = 
length_weight_df[length_weight_df["sex"] == "all"] +# ---- Generate the fitted weight array +fitted_weights = length_weight_all.copy() +# ---- Get actual length bins in dataset +fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] +# ---- Apportion the averaged weights +length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() +# ---- Compute the average weight proportions per length bin per stratum +average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) +# ---- Convert back to a DataFrame +average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( + name="weight_proportion" +) + +# Calculate the aged and unaged weight proportions +# ---- Aged +aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) +# ---- Unaged +unaged_proportions = 1 - aged_proportions +# -------- Re-weight the unaged sexed proportions +unaged_weight_sex_proportions_overall = ( + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) +) + +unaged_proportions.unstack().transpose() +# Format the outputs +# ---- Aged: stratum-sex-age-length relative to aged and total weights +aged_overall_df = ( + specimen_weight_proportion.unstack() + .reset_index(name="weight_proportions") + .merge( + specimen_weights_binned_df[ + stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] + ] + ) +) +# ---- Aged: stratum-sex relative to total weights +aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( + stratum_column + ["species_id", "sex"] + ) +# ---- Add the aged sex proportiosn relative to the overall survey +aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions +# ---- Consolidate the aged and unaged sexed dataframes +# -------- Initialize the dataframe +aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + stratum_column) +# --------- Add the within-unaged weight proportions +aged_unaged_sex_proportions["weight_proportion_unaged"] = ( + length_weight_sex_proportions.stack() +) +# --------- Add the overall-unaged weight proportions +aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( + unaged_weight_sex_proportions_overall.stack() +) +# ---- Overall aged and unaged proportions +aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") +# ---- Set index +aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) +# -------- Add unaged proportions +aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() +# ---- Reset the index +aged_unaged_proportions = aged_unaged_proportions.reset_index() #################################################################################################### # * Functionality for reading in processed acoustic data # TODO: Expand data validator and limit cases to '*.zarr' (for now) From c95cf8dcd3d1f666c0c9d4cbe08804fb43aa62ce Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 14:07:44 -0700 Subject: [PATCH 13/81] Further refinement of `process_biology_data` meth --- echopop/biology.py | 2 +- echopop/live/live_biology.py | 228 ++++++++++++++++++++++++++++++- echopop/test_workflow.py | 15 +- echopop/utils/operations.py | 4 +- echopop/zarr_read_ingest_test.py | 2 +- 5 files changed, 241 insertions(+), 10 deletions(-) diff --git 
a/echopop/biology.py b/echopop/biology.py index 0d24ef6b..cf8f0faa 100644 --- a/echopop/biology.py +++ b/echopop/biology.py @@ -75,7 +75,7 @@ def fit_length_weight_relationship( np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), index=["rate", "initial"], ), - include_groups=False, + # include_groups=False, ) .reset_index() ) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index efc88765..f42dbe82 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -163,7 +163,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) - .apply(lambda x: average_sigma_bs(x, weights="length_count"), include_groups=False) + .apply(lambda x: average_sigma_bs(x, weights="length_count")) .reset_index(name="sigma_bs") ) @@ -244,7 +244,7 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), index=["rate", "initial"], ), - include_groups=False, + # include_groups=False, ) .reset_index() ) @@ -807,4 +807,226 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) # Return output - return fitted_weight_df \ No newline at end of file + return fitted_weight_df + +def weight_proportions(catch_data: pd.DataFrame, + specimen_data: pd.DataFrame, + length_data: pd.DataFrame, + specimen_weight_binned: pd.DataFrame, + length_weight_binned: pd.DataFrame, + length_number_proportion: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + spatial_column = file_configuration["spatial_column"] + # ---- Append additional columns that will be used + contrast_columns = spatial_column + ["sex", "species_id"] + + # Calculate grouped totals + # ---- Sum the net haul weights from station 1/unaged fish + catch_weights = catch_data.count_variable( + contrasts=["species_id"] + spatial_column, + variable="haul_weight", fun="sum" + ) + # ---- Rename resulting columns for both + catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + + # Sum total weights for specimen data + specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + + # For the specimen data + # ---- Sum the net haul weights from station 1/unaged fish + specimen_weights_sex = ( + specimen_weight_binned + .groupby(contrast_columns)["weight"] + .sum() + ) + # ---- Total (per stratum, if it exists) + specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + + # For the length (unaged) dataset + length_weights_sex = ( + length_weight_binned + .groupby(contrast_columns)["weight_interp"] + .sum() + ) + # ---- Further reduce to the grand total (per stratum, if it exists) + length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + + # ---- Standardize the unaged sexed weights + length_weight_standardized = ( + (length_weights_sex / length_weight_total).unstack(0) + * catch_weights["total_weight"].to_numpy() + ) + + # Calculate the specimen weight proportions + # ---- Pivot weight bins + specimen_weight_binned_pvt = ( + specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed = False + ) + ) + # ---- Divide by the aged stratum weights (relative to only aged fish) + specimen_weight_proportions_pvt = ( + specimen_weight_binned_pvt / 
specimen_weight_total.to_numpy() + ) + # ---- Pivot back to the desired format + specimen_weight_proportion = ( + specimen_weight_proportions_pvt + .stack().reset_index(name="weight_proportion") + .pivot_table(columns=spatial_column + ["species_id", "sex"], + index="length_bin", values="weight_proportion") + ) + # ---- Calculate the internal (i.e. only aged fish) for each sex + within_specimen_sex_proportions = ( + specimen_weight_proportion.sum() + ) + + # Calculate the total strata weights + # ---- Index `catch_weights` + catch_weights_idx = catch_weights.set_index(spatial_column + ["species_id"]) + # ---- Compute the spatially-stratified/grouped weights + spatial_weights = ( + pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) + .pivot_table( + columns=spatial_column, + aggfunc="sum", + values="total_weight", + observed=False + ) + ) + + # Calculate the weight proportions relative to the overall stratum weights + # ---- Aged + # -------- Reformat into dataframe and merge with total stratum weights + specimen_weights_binned_df = ( + specimen_weight_binned_pvt.stack() + .to_frame("specimen_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=spatial_column) + ) + # -------- Calculate proportions + specimen_weights_binned_df["weight_proportion_overall"] = ( + specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] + ) + # -------- Consolidate to calculate the sexed proportions per stratum + specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(spatial_column + ["species_id", "sex"])[ + "weight_proportion_overall" + ].sum() + # ---- Unaged + # -------- Reformat into dataframe and merge with total stratum weights + length_weights_sex_standardized_df = ( + length_weight_standardized.stack() + .to_frame("catch_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=spatial_column) + ) + # -------- Calculate proportions + length_weights_sex_standardized_df["weight_proportion_overall"] = ( + length_weights_sex_standardized_df["catch_weight"] + / length_weights_sex_standardized_df["total_weight"] + ) + # -------- Back-calculate the sexed weight proportions relative to just unaged fish + # ------------ Aggregate proportions + length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" + ).transpose().unstack(["species_id"]).sum(axis=0) + # ------------ Re-compute the proportions + length_weight_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + index=["species_id", "sex"], columns=spatial_column, + values="weight_proportion_overall" + ) + / length_total_sex_proportions.to_numpy() + ) + + # Compute the overall length-binned weight distributions among unaged fish + # ---- Extract the number proportions computed for unaged fish + length_number_proportions = length_number_proportion.copy() + # ---- Filter out values besides those computed for 'all' fish + length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] + # ---- Convert to a table + length_number_proportions_tbl = length_number_proportions.pivot_table( + columns=spatial_column + ["species_id"], + index=["length_bin"], + values="proportion_number_length", + aggfunc="sum", + observed=False, + ) + # ---- Extract the fitted weight values calculated for all fish + length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] + # ---- Generate the fitted weight 
array + fitted_weights = length_weight_all.copy() + # ---- Get actual length bins in dataset + fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] + # ---- Apportion the averaged weights + length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + # ---- Compute the average weight proportions per length bin per stratum + average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) + # ---- Convert back to a DataFrame + average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( + name="weight_proportion" + ) + + # Calculate the aged and unaged weight proportions + # ---- Aged + aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) + # ---- Unaged + unaged_proportions = 1 - aged_proportions + # -------- Re-weight the unaged sexed proportions + unaged_weight_sex_proportions_overall = ( + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) + ) + + unaged_proportions.unstack().transpose() + # Format the outputs + # ---- Aged: stratum-sex-age-length relative to aged and total weights + aged_overall_df = ( + specimen_weight_proportion.unstack() + .reset_index(name="weight_proportions") + .merge( + specimen_weights_binned_df[ + spatial_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] + ] + ) + ) + # ---- Aged: stratum-sex relative to total weights + aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( + spatial_column + ["species_id", "sex"] + ) + # ---- Add the aged sex proportiosn relative to the overall survey + aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions + # ---- Consolidate the aged and unaged sexed dataframes + # -------- Initialize the dataframe + aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + spatial_column) + # --------- Add the within-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_unaged"] = ( + length_weight_sex_proportions.stack() + ) + # --------- Add the overall-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( + unaged_weight_sex_proportions_overall.stack() + ) + # ---- Overall aged and unaged proportions + aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") + # ---- Set index + aged_unaged_proportions.set_index(spatial_column + ["species_id"], inplace=True) + # -------- Add unaged proportions + aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() + # ---- Reset the index + aged_unaged_proportions = aged_unaged_proportions.reset_index() + + # Return output + return { + "aged_weight_proportions_df": aged_overall_df, + "unaged_weight_proportions_df": average_length_bin_weights_df, + "aged_unaged_sex_weight_proportions_df": ( + aged_unaged_sex_proportions.astype(float).reset_index().fillna(0.0) + ), + "aged_unaged_weight_proportions_df": aged_unaged_proportions, + } + diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index ee87216b..dec45397 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -6,12 +6,21 @@ from echopop.live.live_core import( LIVE_DATA_STRUCTURE, ) - +from echopop.live.live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_weight_regression, + number_proportions, 
+ length_bin_weights, + preprocess_biology_data +) from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index eae68ec2..5db0e84c 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -178,7 +178,7 @@ def meld(specimen_dataframe: pd.DataFrame, length_dataframe: pd.DataFrame, contr specimen_stacked = ( specimen_dataframe.copy() .groupby(contrasts, observed=False)[["length"]] - .apply(lambda x: len(x), include_groups=True) + .apply(lambda x: len(x)) .reset_index(name="length_count") ) @@ -329,7 +329,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( grouped_data.groupby(contrast).apply( - lambda group: interpolator_factory(group), include_groups=False + lambda group: interpolator_factory(group) ) ).to_dict() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 1d53749a..03c47590 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -553,7 +553,7 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = ( acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .apply(lambda group: integrate_nasc(group, echometrics)) .reset_index() ) # ---- Amend the dtypes if echometrics were computed From 4e4ca876087e3c6a164aaf3ab3a2ab372d16eb7b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 22:34:31 -0700 Subject: [PATCH 14/81] Complete biology processing code --- echopop/live/live_biology.py | 11 +--- echopop/live/live_survey.py | 21 +++++-- echopop/live/sql_methods.py | 2 +- echopop/test_workflow.py | 9 ++- echopop/zarr_read_ingest_test.py | 103 ++++++++++++++++++++++++++++++- 5 files changed, 128 insertions(+), 18 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index f42dbe82..896b3b23 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names +from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs from ..acoustics import ts_length_regression, to_dB, to_linear @@ -198,9 +198,9 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da file_configuration: dict): # Get the spatial column name, if there is one - contrast_columns = file_configuration["spatial_column"].copy() + spatial_column = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used - contrast_columns.extend(["trawl_partition", "sex", "haul_num", 
"species_id", "length_bin"]) + contrast_columns = spatial_column + ["trawl_partition", "sex", "haul_num", "species_id", "length_bin"] # Gather specimen measurements to represent 'all' fish specimen_data_all = specimen_data.assign(sex="all") @@ -810,8 +810,6 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, return fitted_weight_df def weight_proportions(catch_data: pd.DataFrame, - specimen_data: pd.DataFrame, - length_data: pd.DataFrame, specimen_weight_binned: pd.DataFrame, length_weight_binned: pd.DataFrame, length_number_proportion: pd.DataFrame, @@ -832,9 +830,6 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - # Sum total weights for specimen data - specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - # For the specimen data # ---- Sum the net haul weights from station 1/unaged fish specimen_weights_sex = ( diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index e8589b93..ac5bfcc3 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -15,6 +15,7 @@ from .sql_methods import query_processed_files from .live_acoustics import ( compute_nasc, + format_acoustic_dataset, preprocess_acoustic_data ) @@ -23,10 +24,11 @@ compute_average_weights, compute_sigma_bs, length_bin_counts, - length_weight_regression, - number_proportions, length_bin_weights, - preprocess_biology_data + length_weight_regression, + number_proportions, + preprocess_biology_data, + weight_proportions ) @@ -175,6 +177,17 @@ def process_biology_data(self): length_weight_df, self.config["length_distribution"], self.config) + + # Compute the weight proportions + self.input["biology"].update({ + "proportions": weight_proportions(biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config) + }) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -196,7 +209,7 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = nasc_data_df + self.input["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) def estimate_population(self): # method here diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0bb47306..db5e3a06 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -589,7 +589,7 @@ def SQL(db_file: str, command: str, **kwargs): engine = create_engine(f"sqlite:///{db_file}") # Format the data columns, if necessary, to fit within the SQL commands - if command not in ["inspect", "update", "select"]: + if command not in ["inspect", "update"]: kwargs = format_sql_columns(kwargs) # Run the command diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index dec45397..872849fd 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -3,6 +3,8 @@ from echopop.live.sql_methods import query_processed_files from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc from echopop.live.live_biology import preprocess_biology_data +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange + from echopop.live.live_core import( LIVE_DATA_STRUCTURE, ) @@ -14,13 +16,14 
@@ length_weight_regression, number_proportions, length_bin_weights, - preprocess_biology_data + preprocess_biology_data, + weight_proportions ) from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 03c47590..2cfd0cd8 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -15,11 +15,11 @@ from sqlalchemy import create_engine, text, Engine, inspect from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import preprocess_acoustic_data +from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc, format_acoustic_dataset from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -89,6 +89,105 @@ def process_biology_data(self): self.config["length_distribution"], self.config) +# NOTE: ARGUMENT: {working_dataset: Literal["acoustic", "biology"]} +working_dataset = "biology" + +# +acoustic_db = self.config["database"]["acoustics"] +biology_db = self.config["database"]["biology"] + +# +spatial_column = file_configuration["spatial_column"] + +# Create conditional string +condition_str = ( + f"stratum in {np.unique(self.input["nasc_df"]["stratum"])} " + f"& nasc > 0.0" +) + +# Get corresponding data +acoustic_df = SQL(acoustic_db,"select",table_name="survey_data_df", + condition=condition_str) + +# Get corresponding `sigma_bs` +sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", + condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +# ---- Compute the weighted average +sigma_bs_mean_df = ( + sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] + .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) + .to_frame("sigma_bs_mean") + .reset_index() +) + +# +nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) + +# +nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs"]) +) + +psi = 10 ** (-21/10) +psi * 280**2 * 1500 * 128e-6 / 2 +psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] + +psi * (280.0 ** 2) / 1852 ** 2 +depth_area = 280 ** 2 * psi +swath_length = 0.5 * 1852 +depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] +280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] + +SQL(acoustic_db, "map") +beam_angle = 9.0 * np.pi / 
180.0 +280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] +280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * nasc_biology["number_density"] +area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) +area / 1852 ** 2 * nasc_biology["number_density"] +SQL(acoustic_db, "map") + +# Merge hake fraction data into `nasc_interval_df` +# ---- Initial merge +nasc_interval_df = nasc_interval_df.merge( + input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" +) +# ---- Replace `fraction_hake` where NaN occurs +nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) +# ---- Drop NaN +nasc_interval_df.dropna(subset=["transect_num"], inplace=True) + +# Calculate the along-transect number density (animals per nmi^2) +# ---- Merge NASC measurements with mean sigma_bs for each stratum +nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) +# ---- Calculate the number densities +nasc_biology["number_density"] = ( + nasc_biology["fraction_hake"] + * nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +) + + +if working_dataset == "acoustic": + db_file = self.config["database"]["acoustic"] +elif working_dataset == "biology": + db_file = self.config["database"]["biology"] +else: + raise ValueError( + f"Argument for `working_dataset` [{working_dataset}] is invalid." + f" Value must either be 'acoustic' or 'biology'." + ) + +# Extract the necessary correct strata mean sigma_bs +sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] + +# Pull out the length-weight conversion for each stratum +length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] + +# Get the name of the stratum column +stratum_col = settings_dict["transect"]["stratum_name"] + + catch_data = self.input["biology"]["catch_df"] # Get the spatial column name, if there is one From d0f4208fc3d2a8ddc0ba0ab00d1a9344e9c672de Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Sun, 4 Aug 2024 19:10:54 -0700 Subject: [PATCH 15/81] More changes to methods --- echopop/live/live_acoustics.py | 40 +++- echopop/live/live_data_processing.py | 53 +++++ echopop/live/live_survey.py | 24 +- echopop/live/sql_methods.py | 1 + echopop/zarr_read_ingest_test.py | 334 ++++++++++++++++++++++----- 5 files changed, 379 insertions(+), 73 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 11d0b392..5bd29aca 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,10 +1,11 @@ -from typing import Union, Optional +from typing import Union, Optional, List import numpy as np import pandas as pd from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions -from .sql_methods import sql_data_exchange +from .sql_methods import sql_data_exchange, SQL +from .live_data_processing import get_unique_identifiers, query_dataset # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -243,4 +244,37 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Return the formatted dataframe return df - + +def get_nasc_sql_data(db_file: str, + data_dict: dict, + unique_columns: List[str]): + # ---- Add SELECTION columns + data_columns = ( + unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + "biomass_density"] + ) + # ----- Get the SQL dataset + nasc_sql_data = 
query_dataset(db_file,
+                                  data_dict,
+                                  table_name="survey_data_df",
+                                  data_columns = data_columns,
+                                  unique_columns=unique_columns,
+                                  constraint="nasc > 0.0")
+    # ---- Use SQL table data if present
+    if nasc_sql_data is not None and not nasc_sql_data.empty:
+        return nasc_sql_data
+    elif "nasc_df" in data_dict.keys():
+        return data_dict["nasc_df"]
+
+def get_sigma_bs_sql_data(db_file: str,
+                          data_dict: dict,
+                          unique_columns: list):
+
+    # Get corresponding `sigma_bs` DataFrame
+    sigma_bs_df = query_dataset(db_file,
+                                data_dict,
+                                table_name="sigma_bs_mean_df",
+                                data_columns=["sigma_bs", "sigma_bs_count"],
+                                unique_columns=unique_columns)
+
+    sigma_bs_df = SQL(db_file, "select", table_name="sigma_bs_mean_df")
diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py
index cf126230..9587c935 100644
--- a/echopop/live/live_data_processing.py
+++ b/echopop/live/live_data_processing.py
@@ -1,6 +1,8 @@
 import yaml
 import re
+from functools import reduce
+from .sql_methods import SQL
 from pathlib import Path
 from typing import Union, Tuple, Optional, List
@@ -12,3 +14,54 @@
     LIVE_FILE_FORMAT_MAP,
     LIVE_INPUT_FILE_CONFIG_MAP
 )
+
+def get_unique_identifiers(data_dict: dict,
+                           unique_columns: List[str]) -> pd.DataFrame:
+
+    # Gather all dataframes from a dictionary into a list
+    df_list = [df for _, df in data_dict.items()]
+
+    # Get unique values of each contrast column across the biological datasets
+    dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns
+           for df in df_list if not df.empty and isinstance(df, pd.DataFrame)]
+
+    # Reduce into a single DataFrame
+    if len(unique_columns) > 1:
+        return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs)
+    else:
+        return reduce(lambda left, right: pd.merge(left, right, how='inner'), dfs)
+
+
+def query_dataset(db_file: str,
+                  data_dict: dict,
+                  table_name: str,
+                  data_columns: List[str],
+                  unique_columns: List[str],
+                  constraint: str = None):
+
+    # Validate that the desired table exists
+    if SQL(db_file, "validate", table_name=table_name):
+        # ---- Inspect the SQL table
+        inspected_table = SQL(db_file, "inspect", table_name=table_name)
+        # ---- Create a list of intersecting column names
+        unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns)))
+        # ---- Create list of valid columns
+        valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns)))
+        # ---- Get unique identifiers
+        unique_keys_df = get_unique_identifiers(data_dict, unique_keys)
+        # ---- Create conditional string
+        conditional_str = (
+            " & ".join([f"{col} in {np.unique(unique_keys_df[col])}"
+                        for col in unique_keys_df.columns])
+        )
+        # ---- Append the additional constraint statement if present
+        if constraint is not None:
+            conditional_str += f" & {constraint}"
+        # ---- SELECT the dataset using the conditional statement
+        data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys,
+                       condition=conditional_str).filter(data_columns)
+    else:
+        data_sql = None
+
+    # Return the table DataFrame
+    return data_sql
diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py
index ac5bfcc3..6f0b568e 100644
--- a/echopop/live/live_survey.py
+++ b/echopop/live/live_survey.py
@@ -1,4 +1,4 @@
-from typing import Union, Optional
+from typing import Union, Optional, Literal
 from pathlib import Path
 import copy
@@ -171,12 +171,14 @@ def process_biology_data(self):
         )
         # Calculate the average weights among male, female, and all fish
-        fitted_weight_df =
compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) + self.input["weight_stratumn_df"] = ( + compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + ) # Compute the weight proportions self.input["biology"].update({ @@ -209,9 +211,11 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) - def estimate_population(self): - # method here + def estimate_population(self, + working_dataset: Literal["acoustic", "biology"]): + + # method pass diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index db5e3a06..3cdca5fd 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -333,6 +333,7 @@ def sql_select(connection: sqla.Connection, table_name: str, "float": "FLOAT", "int": "INTEGER", 'bool': 'BOOLEAN', + "Interval": "TEXT", "Timestamp": "DATETIME", 'object': 'TEXT', "str": "TEXT", diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2cfd0cd8..09e93fcc 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -48,70 +48,34 @@ database_file = biology_db kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) -def process_biology_data(self): - - # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], - self.config) - - # Bin the length measurements of the biological data - bin_length_data(biology_unprocessed, self.config["length_distribution"]) - - # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - - # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], - self.config) - ) - - # Compute the number proportions - specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, length_binned, - self.config) - ) - - # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) - ) - - # Calculate the average weights among male, female, and all fish - fitted_weight_df = compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) - -# NOTE: ARGUMENT: {working_dataset: Literal["acoustic", "biology"]} -working_dataset = "biology" - -# -acoustic_db = self.config["database"]["acoustics"] -biology_db = self.config["database"]["biology"] +# NOTE: ARGUMENT: 
{working_dataset: Literal["acoustics", "biology"]} +working_dataset = "acoustics" +self = realtime_survey +file_configuration = self.config +self.results["biology"] = self.input["biology_processed"] +self.results["acoustics"] = self.input["nasc_df"] -# +# Get spatial column spatial_column = file_configuration["spatial_column"] -# Create conditional string -condition_str = ( - f"stratum in {np.unique(self.input["nasc_df"]["stratum"])} " - f"& nasc > 0.0" -) +# Initialize the working data dictionary +working_data = copy.deepcopy(self.results) +contrast_columns = [] +# ---- Define unique columns +unique_columns = spatial_column + contrast_columns + +if working_dataset == "acoustics" and self.input["nasc_df"] is not None: + # ---- Get dataset + acoustic_df = get_nasc_sql_data(acoustic_db, + self.input["acoustics"], + unique_columns=unique_columns) -# Get corresponding data -acoustic_df = SQL(acoustic_db,"select",table_name="survey_data_df", - condition=condition_str) # Get corresponding `sigma_bs` -sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", - condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +# sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", +# condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +sigma_bs_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") +sigma_bs_df["stratum"] = 2 # ---- Compute the weighted average sigma_bs_mean_df = ( sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] @@ -121,13 +85,263 @@ def process_biology_data(self): ) # -nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) +nasc_biology = acoustic_df.merge(sigma_bs_mean_df, on=spatial_column) + +# Get the spatially averaged weights +weight_spatial_averages = self.input["weight_stratumn_df"] +# ---- Sub-select 'all' +general_weight_averages = weight_spatial_averages[weight_spatial_averages["sex"] == "all"] +general_weight_averages["stratum"] = 2 # nasc_biology["number_density"] = ( nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs"]) + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +) + +# +nasc_biology = nasc_biology.merge(general_weight_averages) + +nasc_biology["biomass_density"] = nasc_biology["number_density"] * nasc_biology["average_weight"] + +sql_group_update(acoustic_db, dataframe=nasc_biology, + table_name="survey_data_df", columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + +strata_df = self.input["spatial"]["strata"].copy() +strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", + "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan +strata_df.drop(columns=["latitude_interval"], inplace=True) +SQL(acoustic_db, "select", table_name="survey_data_df") + +SQL(biology_db, "drop", table_name="strata_summary_df") +SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, primary_keys=["stratum"]) +SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, + id_columns=["stratum"]) + +tt = pd.DataFrame({ + "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), + "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), + "area": 50 ** 2, + "mean_number_density": 0.0, + "mean_biomass_density": 0.0, + "abundance": 0.0, + "biomass": 0.0 +}) + +nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) +nasc_biology_output_a.loc[3, "x"] = 2 +nasc_biology_output_a.loc[3, "y"] = 3 +nasc_biology_output_a = 
nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", "latitude", "nasc", "number_density", "biomass_density"]) +nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) +nasc_biology_output["number_density"] = ( + nasc_biology_output["nasc"] + / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) ) +nasc_biology_output =nasc_biology_output.merge(general_weight_averages) +nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] * nasc_biology_output["average_weight"] +nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude", "number_density", "biomass_density"]) +nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0].reset_index() + +SQL(acoustic_db, "drop", table_name="reference") +SQL(acoustic_db, "drop", table_name="grid") + +SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) +SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) + +SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) +SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) + +SQL(acoustic_db, "select", table_name="grid") +SQL(acoustic_db, "select", table_name="reference") + +sql_group_update(acoustic_db, dataframe=nasc_biology_output, + table_name="grid", columns=["number_density", "biomass_density"], + unique_columns=["stratum", "x", "y", "longitude", "latitude"]) + +SQL(acoustic_db, "select", table_name="grid") + +from typing import List + +data_table = "grid" +grid_table = "reference" +column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] +coordinates = ["x", "y"] +dataframe = nasc_biology_output + +def update_population_grid(db_file: str, + data_table: str, + grid_table: str, + dataframe: pd.DataFrame, + column_pairs: Union[List[tuple[str, str]], tuple[str, str]], + coordinates: List[str]): + + # Convert `column_pairs` to a list, if needed + if not isinstance(column_pairs, list): + column_pairs = [column_pairs] + + dataframe[coordinates] + # Format the coordinate pairs + # ---- Convert coordinate values into a list of tuples + coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] + # ---- Get unique pairs + coords = list(set(coord_pairs)) + + # Format the SQL script command + # ---- Initialize + sql_script = [] + # ---- Iteratively update + for input_column, output_column in column_pairs: + sql_script.append( + f""" + BEGIN TRANSACTION; + + -- Calculate averages for input_column and update grid_table + WITH avgs AS ( + SELECT + {coordinates[0]}, + {coordinates[1]}, + AVG(d.{input_column}) as avg_value + FROM {data_table} d + GROUP BY d.{coordinates[0]}, d.{coordinates[1]} + ) + + -- Update the grid_table with both average and computed total + UPDATE {grid_table} + SET + mean_{input_column} = ( + SELECT avg_value + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ), + {output_column} = ( + SELECT avg_value * {grid_table}.area + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ) + WHERE EXISTS ( + SELECT 1 + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ); + + COMMIT; + """ + ) + + # Create the engine + engine = create_engine(f"sqlite:///{db_file}") + + # Create the SQL database connection and 
send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript("\n".join(sql_script)) + + +SQL(acoustic_db, "select", table_name=data_table) +SQL(acoustic_db, "select", table_name=grid_table) + + +SQL(acoustic_db, "update", table_name="grid", dataframe=nasc_biology_output, unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) +SQL(acoustic_db, "select", table_name="reference") + +source_db = acoustic_db +target_db = biology_db + +source_table = "grid" +target_table = "strata_summary_df" + +data_columns = ["number_density", "biomass_density"] +strata_columns = ["stratum"] +strata = [2] +stratum_list = ', '.join(map(str, stratum_values)) + +data_column = data_columns[0] +data_columns = data_columns[0] +def sql_update_strata_summary(source_db: str, + target_db: str, + arg_fun: str, + data_columns: List[tuple[str, str]], + strata: list): + + # Format strata list as a string + strata_str = ', '.join(map(str, strata)) + + # Function reference map + FUNCTION_MAP = { + "sum": {"function": "SUM", + "suffix": "sum"}, + "mean": {"function": "AVG", + "suffix": "mean"} + } + + # Prepare the SQL script + sql_script = f""" + -- Attach the source and target databases + ATTACH DATABASE '{source_db}' AS source; + ATTACH DATABASE '{target_db}' AS target; + + """ + + # Dynamically format the cross-database command + for data_column, method in data_columns: + # ----- Format the function-method-suffic keys + suffix = FUNCTION_MAP[method]["suffix"] + fun = FUNCTION_MAP[method]["function"] + # ---- Create the combined SQL command using f-strings + sql_script += f""" + -- Calculate averages and directly update the target table + UPDATE target.{target_table} + SET {data_column}_{suffix} = ( + SELECT {fun}({data_column}) + FROM source.{source_table} + WHERE stratum = target.{target_table}.stratum + ) + WHERE stratum IN ({strata_str}); + """ + # ----- Append DETACH commands only once at the end + sql_script += """ + -- Detach the databases + DETACH DATABASE source; + DETACH DATABASE target; + """ + + # Create the engine + engine = create_engine(f"sqlite:///{target_db}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript(sql_script) + +SQL(biology_db, "select", table_name=target_table) +SQL(acoustic_db, "select", table_name=source_table) +connection.close() +dbapi_conn.close() + + +pairs = [(1, 2), (3, 4), (5, 6)] + +# Convert the pairs into a format suitable for SQL IN clause +pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) + +# Construct the SQL command as a text string +sql_command = f''' +BEGIN TRANSACTION; + +UPDATE reference +SET total = ( + SELECT AVG(g.sigma_bs) * r.area + FROM grid g + WHERE g.stratum = r.stratum_x +) +WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); + +COMMIT; +''' psi = 10 ** (-21/10) psi * 280**2 * 1500 * 128e-6 / 2 From 6020f7946f06afcf66ae7ea0c7ce8f4925764864 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 7 Aug 2024 12:31:55 -0700 Subject: [PATCH 16/81] Full drafted workflow --- config_files/live_survey_year_2019_config.yml | 1 + echopop/live/live_acoustics.py | 78 +++---- echopop/live/live_biology.py | 127 ++++++++-- echopop/live/live_data_loading.py | 5 +- echopop/live/live_data_processing.py | 217 +++++++++++++++++- echopop/live/live_spatial_methods.py | 2 +- echopop/live/live_survey.py | 207 ++++++++++++----- echopop/live/sql_methods.py | 195 
+++++++++++++--- echopop/test_workflow.py | 66 +++--- echopop/zarr_read_ingest_test.py | 167 ++++++++++---- 10 files changed, 828 insertions(+), 237 deletions(-) diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index b7b7aef4..4111ea05 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -45,4 +45,5 @@ input_directories: coastline: directory: coastline/ coastline_name: ne_110m_land + ... diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 5bd29aca..82e4c1a3 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -4,8 +4,7 @@ from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions -from .sql_methods import sql_data_exchange, SQL -from .live_data_processing import get_unique_identifiers, query_dataset +from .sql_methods import sql_data_exchange, SQL, query_processed_files # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -49,8 +48,9 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, ) # Apply spatial settings - prc_nasc_df_filtered.loc[:, "stratum"] = ( - apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) + prc_nasc_df_filtered = ( + prc_nasc_df_filtered + .assign(stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict)) ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object @@ -192,13 +192,25 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, spatial_column = file_configuration["spatial_column"] # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) - .apply(integrate_nasc, echometrics) - .unstack().reset_index() - .sort_values("ping_time") - ) + # ---- Get number of unique sources + if len(np.unique(acoustic_data_df["source"])) == 1: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics) + .reset_index() + .sort_values("ping_time") + ) + else: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + .sort_values("ping_time") + ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes @@ -219,7 +231,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # Return the output return nasc_data_df -def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict): +def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict, meta_dict: dict): # Get acoustic database filename acoustic_db = file_configuration["database"]["acoustics"] @@ -236,6 +248,12 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values + + # Update the successfully processed files + query_processed_files(file_configuration["data_root_dir"], + file_configuration["input_directories"]["acoustics"], + 
meta_dict["provenance"]["acoustic_files"], + processed=True) # Insert the new data into the database & pull in the combined dataset # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT @@ -243,38 +261,4 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) # Return the formatted dataframe - return df - -def get_nasc_sql_data(db_file: str, - data_dict: dict, - unique_columns: List[str]): - # ---- Add SELECTION columns - data_columns = ( - unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", - "biomass_density"] - ) - # ----- Get the SQL dataset - nasc_sql_data = query_dataset(db_file, - data_dict, - table_name="survey_data_df", - data_columns = data_columns, - unique_columns=unique_columns, - constraint="nasc > 0.0") - # ---- Use SQL table data if present - if nasc_sql_data is not None and not nasc_sql_data.empty: - return nasc_sql_data - elif "nasc_df" in data_dict.keys(): - return data_dict["nasc_df"] - -def get_sigma_bs_sql_data(db_file: str, - data_dict: dict, - unique_columns: list): - - # Get corresponding `sigma_bs` DataFrame - sigma_bs_df = query_dataset(db_file, - data_dict, - table_name="sigma_bs_mean_df", - data_columns=["sigma_bs", "sigma_bs_count"], - unique_columns=unique_columns) - - sigma_bs_df = SQL(db_file, "select", table_name="sigma_bs_mean_df") + return df \ No newline at end of file diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 896b3b23..ae7dde6b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update +from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs from ..acoustics import ts_length_regression, to_dB, to_linear @@ -163,15 +163,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] .apply(lambda x: average_sigma_bs(x, weights="length_count")) - .reset_index(name="sigma_bs") + .to_frame("sigma_bs") ) # For SQL database storage purposes, the sum and count are stored instead # ---- Count sum - sigma_bs_df["sigma_bs_count"] = ts_length_df["length_count"].sum() + sigma_bs_df["sigma_bs_count"] = ( + ts_length_df.reset_index() + .groupby(list(set(contrast_columns) - set(["length"])), observed=False)["length_count"] + .sum() + ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] + # ---- Reset index + sigma_bs_df = sigma_bs_df.reset_index() # Get the database file name acoustic_db = file_configuration["database"]["acoustics"] @@ -185,14 +192,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Populate table SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) else: - # ---- Create a filter condition command - condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # ---- Update the table key - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - operation="+", 
columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) - # ---- Update the actual `sigma_bs` value in the table - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], - operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + # ---- Check the present keys + current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + columns=key_list) + # ---- Insert if missing + if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + # ---- Update if not missing + else: + # ---- Create a filter condition command + condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) + # ---- Update the table key + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # ---- Update the actual `sigma_bs` value in the table + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict): @@ -754,11 +769,11 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) specimen_length_complete = complete_distrib_df.copy() - specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]) + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) length_length_complete = complete_distrib_df.copy() - length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]) + length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) # ---- Concatenate the two datasets @@ -806,6 +821,53 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, np.concatenate([weight_all, weight_male, weight_female]) ) + # Get database file + biology_db = file_configuration["database"]["biology"] + + # Insert/update the table + # ---- Create id/primary key + key_values = ["-".join(fitted_weight_df.reset_index() + .loc[idx, contrast_columns] + .values.astype(str)) + for idx in fitted_weight_df.reset_index().index] + # ---- Add to the output + fitted_weight_df["id"] = key_values + if not SQL(biology_db, "validate", table_name="weight_stratum_df"): + # ---- Create + SQL(biology_db, "create", table_name="weight_stratum_df", + dataframe=fitted_weight_df, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="weight_stratum_df", + dataframe=fitted_weight_df, id_columns=["id"]) + else: + # ---- Get previous values in the table + table_df = SQL(biology_db, "select", table_name="weight_stratum_df") + # ---- Check the table keys + table_keys = np.unique(table_df[contrast_columns].apply(tuple, axis=1)).tolist() + # ---- Check the current keys + fitted_weight_df["current_keys"] = fitted_weight_df[contrast_columns].apply(tuple, axis=1) + # ---- Get unique values + current_keys = 
np.unique(fitted_weight_df["current_keys"]).tolist() + # ---- Get INSERTION keys + insertion_keys = list(set(current_keys).difference(set(table_keys))) + # ---- Get UPDATE keys + update_keys = list(set(current_keys).intersection(set(table_keys))) + # ---- INSERT values + if insertion_keys: + # ---- Create DataFrame + insertion_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(insertion_keys)] + # ---- INSERT + SQL(biology_db, "insert", table_name="weight_stratum_df", + dataframe=insertion_df.drop(columns="current_keys")) + # ---- UPDATE values + if update_keys: + # ---- Create DataFrame + update_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(update_keys)] + # ---- UPDATE + sql_group_update(biology_db, dataframe=update_df, + table_name="weight_stratum_df", columns=["average_weight"], + unique_columns=contrast_columns, + id_columns=["id"]) # Return output return fitted_weight_df @@ -1025,3 +1087,42 @@ def weight_proportions(catch_data: pd.DataFrame, "aged_unaged_weight_proportions_df": aged_unaged_proportions, } +# TODO: NEED TO UPDATE TO EITHER INSERT IF NOT PRESENT OR UPDATE OTHERWISE ! ! ! +# ! SEE ABOVE +def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, + file_configuration: dict): + + # Get biology database + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database + biology_db = file_configuration["database"]["biology"] + + # Validate table + if not SQL(biology_db, "validate", table_name="strata_summary_df"): + + # Create copy + strata_df = spatial_data.copy() + + # Define new columns + strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", + "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan + # ---- Drop 'latitude_interval' + strata_df.drop(columns=["latitude_interval"], inplace=True) + + # ---- Create + SQL(biology_db, "create", table_name="strata_summary_df", + dataframe=strata_df, primary_keys=["stratum"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="strata_summary_df", + dataframe=strata_df, id_columns=["stratum"]) + + # Get unique strata values + strata_values = np.unique(nasc_biology_data["stratum"]).tolist() + + # Update the table + sql_update_strata_summary(source_db=acoustic_db, target_db=biology_db, + source_table="survey_data_df", target_table="strata_summary_df", + data_columns=[("number_density", "mean"), + ("biomass_density", "mean")], + strata=strata_values) \ No newline at end of file diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 823ebac4..f507d63f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -2,7 +2,7 @@ from typing import Union, Tuple, Optional, List import yaml import re -from .sql_methods import SQL, query_processed_files, sql_data_exchange +from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database import pandas as pd from datetime import datetime import xarray as xr @@ -229,6 +229,9 @@ def validate_data_directory(file_configuration: dict, dataset: str, "Data loading argument `input_filenames` must be a list." 
) + # Initialize the database file + initialize_database(root_directory, file_settings) + # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( query_processed_files(root_directory, file_settings, data_files) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 9587c935..928ced70 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,7 +2,8 @@ import re from functools import reduce -from .sql_methods import SQL +from .sql_methods import SQL, sql_group_update +from .live_biology import summarize_strata from pathlib import Path from typing import Union, Tuple, Optional, List @@ -23,14 +24,13 @@ def get_unique_identifiers(data_dict: dict, # Get unique values of each contrast column across the biological datasets dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - for df in df_list if not df.empty and isinstance(df, pd.DataFrame)] + for df in df_list if isinstance(df, pd.DataFrame) and not df.empty] # Reduce into a single DataFrame if len(unique_columns) > 1: return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) else: - return reduce(lambda left, right: pd.merge(left, right, how='inner'), dfs) - + return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) def query_dataset(db_file: str, data_dict: dict, @@ -49,10 +49,10 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string + # ---- Create conditional string conditional_str = ( - " & ".join([f"{col} in {np.unique(unique_keys_df[col])}" - for col in unique_keys_df.columns]) + " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + for col in unique_keys_df.columns]) ) # ---- Append the additional constraint statement if present if constraint is not None: @@ -65,3 +65,206 @@ def query_dataset(db_file: str, # Return the table DataFrame return data_sql + +def get_average_strata_weights(db_file: str, + data_dict: dict, + unique_columns: list): + + # Get corresponding `weight_fitted_df` from the database + weight_fitted_sql_df = query_dataset(db_file, data_dict, table_name="weight_stratum_df", + data_columns=unique_columns + ["average_weight"], + unique_columns=unique_columns, + constraint="sex == 'all'") + # ---- Use SQL table data if present + if weight_fitted_sql_df is not None and not weight_fitted_sql_df.empty: + # ---- Return output + return weight_fitted_sql_df + else: + return None + +def acoustic_pipeline(acoustic_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database file + biology_db = file_configuration["database"]["biology"] + + # Check whether data dictionary is empty + if acoustic_dict["nasc_df"] is None or acoustic_dict["nasc_df"].empty: + # ---- Print, if verbose + if verbose: + print( + f"No new processed acoustic data available for processing." 
+ ) + else: + # Get related acoustic data + acoustic_df = get_nasc_sql_data(acoustic_db, + acoustic_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + acoustic_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) + + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + acoustic_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + + # Update the survey population estimate DataFrame with the newly computed densities + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) + +def get_nasc_sql_data(db_file: str, + data_dict: dict, + unique_columns: List[str]): + + # Add SELECTION columns + data_columns = ( + unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + "biomass_density", "id"] + ) + # ----- Get the SQL dataset + nasc_sql_data = query_dataset(db_file, + data_dict, + table_name="survey_data_df", + data_columns = data_columns, + unique_columns=unique_columns, + constraint="nasc > 0.0") + # ---- Use SQL table data if present + if nasc_sql_data is not None and not nasc_sql_data.empty: + return nasc_sql_data + elif "nasc_df" in data_dict.keys(): + return data_dict["nasc_df"] + +def get_sigma_bs_sql_data(db_file: str, + data_dict: dict, + unique_columns: list): + + # Get corresponding `sigma_bs` DataFrame + sigma_bs_sql_df = query_dataset(db_file, data_dict, table_name="sigma_bs_mean_df", + data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], + unique_columns=unique_columns) + # ---- Use SQL table data if present + if sigma_bs_sql_df is not None and not sigma_bs_sql_df.empty: + # ---- Compute the weighted average + sigma_bs_mean_sql_df = ( + sigma_bs_sql_df.groupby(unique_columns)[["sigma_bs", "sigma_bs_count"]] + .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) + .to_frame("sigma_bs_mean") + .reset_index() + ) + # ---- Return output + return sigma_bs_mean_sql_df + else: + return None + + + +def biology_pipeline(biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database 
file + biology_db = file_configuration["database"]["biology"] + + # Check for data completion + # ---- List of boolean values + full_biology_data = ( + [True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) + else False for _, df in biology_dict.items()] + ) + # ---- Validation + if not all(full_biology_data): + # ---- Print, if verbose + if verbose: + print( + f"No new processed biology data available for processing." + ) + else: + # Get related biology data + acoustic_df = get_nasc_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) + + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + biology_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + + # Update the survey population estimate DataFrame with the newly computed densities + if not nasc_biology.empty: + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 2dd8cefc..6ce7741f 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -53,7 +53,7 @@ def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), labels = inpfc_df.loc[:, "stratum"] - ) + ).astype(int) return strata diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 6f0b568e..f3cb7f5a 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,6 +1,10 @@ from typing import Union, Optional, Literal from pathlib import Path +from datetime import datetime import copy +import pandas as pd + +from .sql_methods import query_processed_files from .live_core import( LIVE_DATA_STRUCTURE, @@ -48,6 +52,8 @@ def __init__( ): # Initialize `meta` attribute self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + # ---- Add datetime + self.meta["date"] = datetime.now() # Loading the configuration settings and definitions that are used to # initialize the Survey class object @@ -74,6 +80,48 @@ def __init__( if verbose: pass + def __repr__(self): + + # Get any acoustic files created + if "acoustic_files" in self.meta["provenance"]: + # ---- Get the filenames + acoustic_filenames = self.meta["provenance"]["acoustic_files"] + # ---- Subset if 
many files are being processed + if len(acoustic_filenames) > 2: + acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + # ---- Format string + acoustic_files = ", ".join(acoustic_filenames) + else: + acoustic_files = "None" + + # Get any biology files created + if "biology_files" in self.meta["provenance"]: + # ---- Get the filenames + biology_filenames = self.meta["provenance"]["biology_files"] + # ---- Subset if many files are being processed + if len(biology_filenames) > 4: + biology_filenames = biology_filenames + ["..."] + # ---- Format string + biology_files = ", ".join(biology_filenames) + else: + biology_files = "None" + + # Get linked database names + linked_dbs = ( + "\n ".join([f"{key.title()}: {db}" for key, db in self.config["database"].items()]) + ) + + return ( + f"LiveSurvey-class object \n" + f"Timestamp: {self.meta['date']} \n" + f"Acoustic files being processed: \n {acoustic_files}\n" + f"Biology files being processed: \n {biology_files}\n" + f"Linked databases: \n {linked_dbs}" + ) + + def __str__(self): + return self.__repr__() + def load_acoustic_data(self, input_filenames: Optional[list] = None, verbose: bool = True): @@ -93,13 +141,17 @@ def load_acoustic_data(self, # TODO: SettingWithCopyWarning: self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), self.input["spatial"], - self.config) + self.config) + # ---- Add meta key + self.meta["provenance"].update({ + "acoustic_files": acoustic_files, + }) # TODO: Add verbosity for printing database filepaths/connections if verbose: # ---- Create file list file_list = "\n".join(acoustic_files) print( - f"The following acoustic files have been processed:\n" + f"The following acoustic files are being processed:\n" f"{file_list}." ) else: @@ -118,17 +170,22 @@ def load_biology_data(self, # ---- Create file list file_list = "\n".join(biology_files) print( - f"The following biological files have been processed:\n" + f"The following biological files are being processed:\n" f"{file_list}." 
) - # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config) + # Read in the biology data files + initial_biology_output = eldl.read_biology_files(biology_files, self.config) - # Preprocess the biology dataset - self.input["biology"], self.input["biology_processed"] = ( - preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) - ) + # Preprocess the biology dataset + self.input["biology"], self.input["biology_processed"] = ( + preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + ) + + # Add meta key + self.meta["provenance"].update({ + "biology_files": biology_files, + }) def process_biology_data(self): @@ -137,59 +194,73 @@ def process_biology_data(self): # ----- Unprocessed biology_unprocessed = self.input["biology"] - # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + # Check if data are present + unprocess_data_dfs = ( + [True if isinstance(df, pd.DataFrame) and not df.empty else False + for _, df in biology_unprocessed.items()] + ) + # ---- Proceed in processing the unprocessed data + if all(unprocess_data_dfs): - # Bin the length measurements of the biological data - bin_length_data(biology_unprocessed, self.config["length_distribution"]) + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], + biology_unprocessed["length_df"], + self.config) - # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - - # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - self.config) - ) + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) - # Compute the number proportions - specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, - length_binned, self.config) - ) + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) + + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + self.config) + ) - # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) - ) + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, + length_binned, self.config) + ) - # Calculate the average weights among male, female, and all fish - self.input["weight_stratumn_df"] = ( - compute_average_weights(specimen_number_proportion, - length_number_proportion, - 
sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) - ) - - # Compute the weight proportions - self.input["biology"].update({ - "proportions": weight_proportions(biology_unprocessed["catch_df"], - specimen_weight_binned, - length_weight_binned, - length_number_proportion, - length_weight_df, - self.config) - }) - + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + self.input["weight_stratum_df"] = ( + compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + ) + + # Compute the weight proportions + self.input["biology"].update({ + "proportions": weight_proportions(biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config) + }) + + # Update the database + query_processed_files(self.config["data_root_dir"], + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files"], + processed=True) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -211,11 +282,27 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, + self.config, + self.meta) + + # Update the database def estimate_population(self, - working_dataset: Literal["acoustic", "biology"]): + working_dataset: Literal["acoustic", "biology"], + verbose: bool = True): + + # method + if working_dataset == "acoustic": + eldp.acoustic_pipeline(self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) # method - pass + if working_dataset == "biology": + eldp.biology_pipeline(self.input["biology"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 3cdca5fd..335795b7 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -205,6 +205,16 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, elif not isinstance(columns, list): columns = [columns] + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) + # Format the SET command # ---- Update column by applying arithmetic between table and dataframe if operation is not None and dataframe is not None: @@ -222,7 +232,8 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list set_clause = ', '.join(set_list) - + [f"{column} = {dataframe[column].values[0]}" for column in columns] + ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Add the WHERE clause if a parsed condition is provided if condition is 
not None: # ---- Parse the conditional string @@ -314,6 +325,89 @@ def sql_select(connection: sqla.Connection, table_name: str, else: return converted_data +def validate_tables(db_file: str, table_name: Union[str, List[str]], + reference_dataframe: pd.DataFrame): + + # Helper function + def _validate_table(table): + # ---- Check table existence + if not SQL(db_file, "validate", table_name=table): + raise KeyError( + f"SQL database table `{table}` in `{db_file}` failed to initialize!" + ) + # ---- Get DataFrame dtypes (avoid 'object' and similar ambiguous typing) + expected_dtypes = ( + {col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns} + ) + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table) + # ---- Get the column dtypes (with back-formatting via configuration mapping) + table_dtypes = { + col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ + for col in inspected_table.keys() + } + # ---- Compare keys + key_difference = list(set(expected_dtypes).difference(set(table_dtypes))) + # -------- Raise error, if needed + if key_difference: + raise KeyError( + f"The following columns are missing from table `{table}` in `{db_file}`: " + f"{', '.join(key_difference)}." + ) + # ---- Compare dtypes + dtypes_comparison = ( + {key: table_dtypes[key] for key in table_dtypes + if table_dtypes[key] != expected_dtypes.get(key)} + ) + # ---- Get key names + dtypes_different_names = list(set(dtypes_comparison)) + # ---- Raise error, if needed + if dtypes_different_names: + raise TypeError( + f"The following columns from table `{table}` in `{db_file}` had unexpected " + f"datatypes: {', '.join(dtypes_different_names)}." + ) + + # Iterate through tables to validate + if isinstance(table_name, list): + _ = [_validate_table(table) for table in table_name] + else: + _validate_table(table_name) + +def initialize_database(root_directory: Path, file_settings: dict): + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = root_directory / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Spoof an empty DataFrame for formatting purposes + template_df = pd.DataFrame({"filepath": ["dummy/path/string"]}) + + # Create two tables for 'files read' and 'files processed' + # ---- Read files + SQL(db_file, "create", table_name="files_read", dataframe=template_df, + primary_keys=["filepath"]) + # ---- Processed files + SQL(db_file, "create", table_name="files_processed", dataframe=template_df, + primary_keys=["filepath"]) + + # Query the database ensure it exists + # ---- File existence + if not Path(db_file).exists(): + raise FileExistsError( + f"SQL database file `{db_file}` failed to initialize!" 
+ ) + + # Validate the created tables + validate_tables(db_file, ["files_read", "files_processed"], template_df) + SQL_COMMANDS = { "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), @@ -492,16 +586,15 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation -def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path]) -> dict: +def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path], + processed=False) -> dict: # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = root_directory / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) + db_directory = Path(root_directory) / "database" # ---- Complete path to the database file db_file = db_directory / db_name @@ -509,28 +602,19 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List files_str = [str(file) for file in files] # ---- Create DataFrame current_files = pd.DataFrame(files_str, columns=["filepath"]) - - # Check for the table `files_read` - files_read_tbl = SQL(db_file, "validate", table_name="files_read") - - # Validate whether the table exists; if not, create the table and then insert - if not files_read_tbl: - # ---- Create table - SQL(db_file, "create", table_name="files_read", dataframe=current_files, - primary_keys = ["filepath"]) - # ---- Populate table - SQL(db_file, "insert", table_name="files_read", dataframe=current_files) - # ---- Break early - return files_str, db_file - - # Query already existing files - previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) - # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) - - # Filter out previously processed files - # ---- Apply filter by comparing sets and return the output - return list(set(files_str) - set(previous_files)), db_file + + # Check against `files_processed` + previous_files = SQL(db_file, "select", table_name="files_processed", output_type=str) + + # Insert the files into the `files_read` table + if processed: + SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, + id_columns=["filepath"]) + else: + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, + id_columns=["filepath"]) + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): @@ -582,6 +666,63 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str f"Attempted reset of [{str(db_file)}] failed." 
) +def sql_update_strata_summary(source_db: str, + target_db: str, + source_table: str, + target_table: str, + data_columns: List[tuple[str, str]], + strata: list): + + # Format strata list as a string + strata_str = ', '.join(map(str, strata)) + + # Function reference map + FUNCTION_MAP = { + "sum": {"function": "SUM", + "suffix": "sum"}, + "mean": {"function": "AVG", + "suffix": "mean"} + } + + # Prepare the SQL script + sql_script = f""" + -- Attach the source and target databases + ATTACH DATABASE '{source_db}' AS source; + ATTACH DATABASE '{target_db}' AS target; + + """ + + # Dynamically format the cross-database command + for data_column, method in data_columns: + # ----- Format the function-method-suffic keys + suffix = FUNCTION_MAP[method]["suffix"] + fun = FUNCTION_MAP[method]["function"] + # ---- Create the combined SQL command using f-strings + sql_script += f""" + -- Calculate averages and directly update the target table + UPDATE target.{target_table} + SET {data_column}_{suffix} = ( + SELECT {fun}({data_column}) + FROM source.{source_table} + WHERE stratum = target.{target_table}.stratum + ) + WHERE stratum IN ({strata_str}); + """ + # ----- Append DETACH commands only once at the end + sql_script += """ + -- Detach the databases + DETACH DATABASE source; + DETACH DATABASE target; + """ + + # Create the engine + engine = create_engine(f"sqlite:///{target_db}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript(sql_script) + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 872849fd..35ca3b3a 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,59 +1,51 @@ from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import reset_db_files -from echopop.live.sql_methods import query_processed_files -from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc -from echopop.live.live_biology import preprocess_biology_data -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange - -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, -) -from echopop.live.live_biology import ( - bin_length_data, - compute_average_weights, - compute_sigma_bs, - length_bin_counts, - length_weight_regression, - number_proportions, - length_bin_weights, - preprocess_biology_data, - weight_proportions -) -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl - -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" - -realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) +from echopop.live.sql_methods import SQL +# Set up `LiveSurvey` object +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +realtime_survey #################################################################################################### # TEST: ACOUSTICS 
#################################################################################################### -# NOTE: Reset database file for utility purposes -reset_db_files(realtime_survey.config) - # NOTE: LOAD DATA realtime_survey.load_acoustic_data() +realtime_survey +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") +SQL(realtime_survey.config["database"]["acoustics"], "map") +realtime_survey.config["database"] +realtime_survey.meta["provenance"] # NOTE: INITIAL PROCESSING [JUST ACOUSTIC] +# ! ERRORS OUT WHEN NUMBER OF FILES == 1 realtime_survey.process_acoustic_data() -realtime_survey.input +realtime_survey.estimate_population(working_dataset="acoustic") +self = realtime_survey +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") #################################################################################################### # TEST: BIOLOGY #################################################################################################### -# NOTE: Reset database file for utility purposes -reset_db_files(realtime_survey.config) - # NOTE: LOAD DATA realtime_survey.load_biology_data() -realtime_survey.input # NOTE: INITIAL PROCESSING [JUST BIOLOGY] realtime_survey.process_biology_data() -realtime_survey.input +realtime_survey.estimate_population(working_dataset="biology") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") +SQL(realtime_survey.config["database"]["biology"], "map") #################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates # TODO: Add argument that indicates what the new datasets and what data need to be pulled in # TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} -realtime_survey.estimate_population() \ No newline at end of file +# ! 
SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) +realtime_survey.estimate_population(working_dataset="biology") +realtime_survey.estimate_population(working_dataset="acoustic") +#################################################################################################### +# TEST: GET DATA +#################################################################################################### +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 09e93fcc..1c69351e 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -12,14 +12,16 @@ import os import re import contextlib +from echopop.acoustics import ts_length_regression, to_linear, to_dB from sqlalchemy import create_engine, text, Engine, inspect from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange, initialize_database, sql_update_strata_summary from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl +from echopop.live.live_data_processing import query_dataset, get_unique_identifiers from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc, format_acoustic_dataset +from echopop.live.live_acoustics import integrate_nasc from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -31,6 +33,28 @@ proportions_dict=analysis_dict["biology"]["proportions"]["number"] length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] + +files = data_files + + + + + + # Map the table names and validate table creation + # ---- Get table names + tables = SQL(db_file, "map") + # ---- `files_read` + if "files_read" not in tables: + raise KeyError( + f"SQL database table `files_read` in `{db_file}` failed to initialize!" + ) + # ---- `files_processed` + if "files_processed" not in tables: + raise KeyError( + f"SQL database table `files_processed` in `{db_file}` failed to initialize!" 
+ ) + + #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths @@ -64,50 +88,105 @@ # ---- Define unique columns unique_columns = spatial_column + contrast_columns -if working_dataset == "acoustics" and self.input["nasc_df"] is not None: - # ---- Get dataset - acoustic_df = get_nasc_sql_data(acoustic_db, - self.input["acoustics"], - unique_columns=unique_columns) - - -# Get corresponding `sigma_bs` -# sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", -# condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") -sigma_bs_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") -sigma_bs_df["stratum"] = 2 -# ---- Compute the weighted average -sigma_bs_mean_df = ( - sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] - .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) - .to_frame("sigma_bs_mean") - .reset_index() -) - -# -nasc_biology = acoustic_df.merge(sigma_bs_mean_df, on=spatial_column) - -# Get the spatially averaged weights -weight_spatial_averages = self.input["weight_stratumn_df"] -# ---- Sub-select 'all' -general_weight_averages = weight_spatial_averages[weight_spatial_averages["sex"] == "all"] -general_weight_averages["stratum"] = 2 - -# -nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) -) - -# -nasc_biology = nasc_biology.merge(general_weight_averages) +acoustic_db = file_configuration["database"][working_dataset] +self = realtime_survey +acoustic_dict = self.input["acoustics"] +verbose = True +contrast_columns = [] +db_file = acoustic_db +table_name="survey_data_df" +data_columns = data_columns +unique_columns=unique_columns +constraint="nasc > 0.0" +data_dict = self.input["acoustics"] +data_dict["nasc_df"]["stratum"] = 1 +data_dict["prc_nasc_df"]["stratum"] = 2 +table_name = "sigma_bs_mean_df" +data_columns=["sigma_bs", "sigma_bs_count"] +biology_db +strata_df = self.input["spatial"]["strata"] + +def biology_pipeline(biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database file + biology_db = file_configuration["database"]["biology"] + + # Check for data completion + # ---- List of boolean values + full_biology_data = ( + [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is not None] + ) + # ---- Validation + if not all(full_biology_data): + # ---- Print, if verbose + if verbose: + print( + f"No new processed biology data available for processing." 
+ ) + else: + # Get related biology data + acoustic_df = get_nasc_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) -nasc_biology["biomass_density"] = nasc_biology["number_density"] * nasc_biology["average_weight"] + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + biology_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) -sql_group_update(acoustic_db, dataframe=nasc_biology, - table_name="survey_data_df", columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + # Update the survey population estimate DataFrame with the newly computed densities + if not nasc_biology.empty: + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) + +db_file=acoustic_db +dataframe=nasc_biology +table_name="survey_data_df" +columns=["number_density", "biomass_density"] +unique_columns=["stratum", "longitude", "latitude", "ping_time"] +nasc_biology["number_density"].sum() / 2 +nasc_biology["number_density"] +SQL(acoustic_db, "select", table_name="survey_data_df") +SQL(biology_db, "select", table_name="strata_summary_df") strata_df = self.input["spatial"]["strata"].copy() strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan @@ -318,7 +397,7 @@ def sql_update_strata_summary(source_db: str, _ = dbapi_conn.executescript(sql_script) SQL(biology_db, "select", table_name=target_table) -SQL(acoustic_db, "select", table_name=source_table) +SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() connection.close() dbapi_conn.close() From 1ee2e208e7989e9809a24f626c884917fdb0e548 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 7 Aug 2024 18:28:58 -0700 Subject: [PATCH 17/81] Patches --- echopop/live/live_acoustics.py | 47 ++++---- echopop/live/live_biology.py | 158 +++++++++++++++++++----- echopop/live/live_data_processing.py | 4 +- echopop/live/sql_methods.py | 63 +++++++--- echopop/test_workflow.py | 172 ++++++++++++++++++++++++++- echopop/zarr_read_ingest_test.py | 2 +- 6 files changed, 370 insertions(+), 76 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 82e4c1a3..2da07e07 100644 --- a/echopop/live/live_acoustics.py +++ 
b/echopop/live/live_acoustics.py @@ -169,21 +169,24 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Return the dictionary return echometrics -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): +def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + nasc_dict = {"nasc": data_df["NASC"].sum()} # Horizontally concatenate `echometrics`, if `True` if echometrics: # ---- Compute values # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) + echometrics_dict = estimate_echometrics(data_df) # ---- Merge nasc_dict.update(echometrics_dict) # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) + # return pd.Series(nasc_dict) + return pd.DataFrame(nasc_dict, index=[0]) + + # return pd.DataFrame([nasc_dict]) def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True): @@ -193,24 +196,24 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources - if len(np.unique(acoustic_data_df["source"])) == 1: - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(integrate_nasc, echometrics) - .reset_index() - .sort_values("ping_time") - ) - else: - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(integrate_nasc, echometrics, include_groups=False) - .unstack().reset_index() - .sort_values("ping_time") - ) + # if len(np.unique(acoustic_data_df["ping_time"])) > 1: + # nasc_data_df = ( + # acoustic_data_df + # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + # observed=False) + # .apply(integrate_nasc, echometrics, include_groups=False).unstack() + # .reset_index() + # .sort_values("ping_time") + # ) + # else: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) + .reset_index() + .sort_values("ping_time") + ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index ae7dde6b..27a53bd1 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -162,7 +162,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Compute haul-specific means sigma_bs_df = ( ts_length_df - .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + .groupby(key_list, observed=False) [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] .apply(lambda x: average_sigma_bs(x, weights="length_count")) .to_frame("sigma_bs") @@ -172,42 +172,75 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Count sum sigma_bs_df["sigma_bs_count"] = ( ts_length_df.reset_index() - .groupby(list(set(contrast_columns) - set(["length"])), observed=False)["length_count"] + .groupby(key_list, observed=False)["length_count"] .sum() ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] # ---- Reset index sigma_bs_df = sigma_bs_df.reset_index() - + # ---- Create a 
tuple-key that can be used as an identifier + sigma_bs_df.loc[:, "id"] = sigma_bs_df[key_list].apply(tuple, axis=1).astype(str) + # Get the database file name acoustic_db = file_configuration["database"]["acoustics"] # Check for `sigma_bs_mean_df` in the database file # ---- Query database if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + # ---- Create an insertion dataframe + insertion_df = sigma_bs_df.copy() # ---- Create - SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - primary_keys=list(set(contrast_columns) - set(["length"]))) + SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, + primary_keys=["id"]) # ---- Populate table - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) else: - # ---- Check the present keys - current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", - columns=key_list) - # ---- Insert if missing - if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) - # ---- Update if not missing - else: + # ---- Get previous values in the table + table_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") + # ---- Check the table keys + table_keys = np.unique(table_df["id"]).tolist() + # ---- Get unique values + current_keys = np.unique(sigma_bs_df["id"]).tolist() + # ---- Get INSERTION keys + insertion_keys = list(set(current_keys).difference(set(table_keys))) + # ---- Get UPDATE keys + update_keys = list(set(current_keys).intersection(set(table_keys))) + # ---- INSERT values + if insertion_keys: + # ---- Create DataFrame + insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] + # ---- INSERT + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", + dataframe=insertion_df) + # ---- UPDATE values + if update_keys: + update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # ---- Update the table key - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) - # ---- Update the actual `sigma_bs` value in the table - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], - operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + sql_group_update(acoustic_db, dataframe=update_df, table_name="sigma_bs_mean_df", + columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", + unique_columns=["id"], id_columns=["id"]) + # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) + + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], + # condition=condition_str) + # # ---- Check the present keys + # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + # columns=key_list) + # # ---- Insert if missing + # if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): + # SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + # # ---- Update if not missing + # else: + # # ---- Create a filter condition command + # condition_str = " & ".join([f"{key} in 
{np.unique(sigma_bs_df[key])}" for key in key_list]) + # # ---- Update the table key + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # # ---- Update the actual `sigma_bs` value in the table + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + # operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict): @@ -361,7 +394,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # columns=list(set(length_data.columns) - set(["length_bin"]))) # list(set(length_data.columns) - set(["length_bin"])) # Get length distribution - # distribution_df = file_configuration["length_distribution"] + distribution_df = file_configuration["length_distribution"] # Generate sex-specific interpolators for fitted length-weight values for binned length counts # ---- Parse the male- and female-specific fitted weight values @@ -407,7 +440,21 @@ def weight_interpolator(dataframe_row): ).reset_index() # Check for `length_weight_df` in the database file + # ---- Combine the datasets + full_weight_distrib = ( + pd.concat([length_table_sexed.rename(columns={"weight_interp": "weight"}), + specimen_table_sexed], ignore_index=True) + ) + # ---- Sum by bin + full_weight_distrib = ( + full_weight_distrib.groupby(contrast_columns + ["length_bin"])["weight"].sum().reset_index() + ) # ---- Create id/primary key + full_weight_distrib.loc[:, "id"] = ( + full_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1).astype(str) + .str.replace("'", "") + ) + # key_values = ["-".join(length_table_sexed.reset_index() .loc[idx, ["species_id", "sex", "length_bin"]] .values.astype(str)) @@ -416,20 +463,65 @@ def weight_interpolator(dataframe_row): length_table_sexed["id"] = key_values # ---- Query database if not SQL(biology_db, "validate", table_name="length_weight_df"): + # ---- Create full table + overall_weight_distrib = ( + pd.DataFrame({"stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + + [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1]}) + .merge(pd.DataFrame({"sex": ["male", "female"]}), how="cross") + .merge(pd.DataFrame( + {"species_id": np.unique(file_configuration["species"]["number_code"])} + ), how="cross") + .merge(distribution_df.filter(["length_bin"]), how="cross") + ) + # ---- Pre-allocate weight + overall_weight_distrib.loc[:, "weight"] = 0.0 + # ---- Create id/primary key + overall_weight_distrib.loc[:, "id"] = ( + overall_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1) + .astype(str) + .str.replace("'", "") + ) # ---- Create SQL(biology_db, "create", table_name="length_weight_df", - dataframe=length_table_sexed, primary_keys=["id"]) - # ---- Populate table + dataframe=overall_weight_distrib, primary_keys=["id"]) + # ---- INSERT SQL(biology_db, "insert", table_name="length_weight_df", - dataframe=length_table_sexed, id_columns=["id"]) - else: - # ---- Update the table - sql_group_update(db_file=biology_db, - dataframe=length_table_sexed, - table_name="length_weight_df", - columns=["weight_interp"], - unique_columns=contrast_columns, - id_columns=["id"]) + dataframe=overall_weight_distrib) + # ---- UPDATE + sql_group_update(biology_db, dataframe=full_weight_distrib, table_name="length_weight_df", + columns=["weight"], + 
unique_columns=["id"], id_columns=["id"]) + # table_df = SQL(biology_db, "select", table_name="length_weight_df") + # # ---- Check the table keys + # table_keys = np.unique(table_df["id"]).tolist() + # # ---- Get unique values + # current_keys = np.unique(full_weight_distrib["id"]).tolist() + # # ---- Get INSERTION keys + # insertion_keys = list(set(current_keys).difference(set(table_keys))) + # # ---- Get UPDATE keys + # update_keys = list(set(current_keys).intersection(set(table_keys))) + # # ---- INSERT values + # if insertion_keys: + # # ---- Create DataFrame + # insertion_df = full_weight_distrib[full_weight_distrib["id"].isin(insertion_keys)] + # # ---- INSERT + # SQL(biology_db, "insert", table_name="length_weight_df", + # dataframe=insertion_df) + # # ---- UPDATE values + # if update_keys: + # update_df = full_weight_distrib[full_weight_distrib["id"].isin(update_keys)] + # # ---- Create a filter condition command + # sql_group_update(biology_db, dataframe=update_df, table_name="length_weight_df", + # columns=["weight"], + # unique_columns=["id"], id_columns=["id"]) + + # # ---- Update the table + # sql_group_update(db_file=biology_db, + # dataframe=length_table_sexed, + # table_name="length_weight_df", + # columns=["weight_interp"], + # unique_columns=contrast_columns, + # id_columns=["id"]) # length_sql_sexed diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 928ced70..18d493d0 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -144,8 +144,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Update the survey population estimate DataFrame with the newly computed densities if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + columns=["number_density", "biomass_density"], + unique_columns=["id"]) # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 335795b7..f680e908 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -232,8 +232,7 @@ def format_value(x): set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list set_clause = ', '.join(set_list) - [f"{column} = {dataframe[column].values[0]}" for column in columns] - ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) + # Add the WHERE clause if a parsed condition is provided if condition is not None: # ---- Parse the conditional string @@ -442,6 +441,7 @@ def sql_group_update(db_file: str, table_name: str, columns: List[str], unique_columns: List[str], + operation: Optional[str] = None, id_columns: Optional[List[str]] = None): # Check for unique values contained within the table @@ -468,9 +468,7 @@ def sql_group_update(db_file: str, # Insert into the table if not otherwise present if not filtered_df.empty: SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) - - # Update the table - # ---- Format the conditional string + case_statements = [] for col in columns: case_stmt = "CASE" @@ -482,24 +480,57 @@ def sql_group_update(db_file: str, ]) # Add the WHEN condition to the CASE statement case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" - case_stmt += " END" - 
case_statements.append(f"{col} = {case_stmt}") + case_stmt += f" ELSE {col} END" + + if operation is not None: + case_statements.append(f"{col} = {col} {operation} {case_stmt}") + else: + case_statements.append(f"{col} = {case_stmt}") + + + # Update the table + # ---- Format the conditional string + # case_statements = [] + # for col in columns: + # case_stmt = "CASE" + # for _, row in dataframe.iterrows(): + # # Construct the filter condition based on unique_columns + # filter_conditions = ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # # Add the WHEN condition to the CASE statement + # case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" + # case_stmt += " END" + # case_statements.append(f"{col} = {case_stmt}") # Construct the full SQL UPDATE statement - update_clause = ', '.join(case_statements) + update_clause = ", ".join(case_statements) # Format the SQL COMMAND string + # sql_command = f""" + # UPDATE {table_name} + # SET {update_clause} + # WHERE ({' OR '.join([ + # ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # for _, row in dataframe.iterrows() + # ])}); + # """ sql_command = f""" UPDATE {table_name} - SET {update_clause} - WHERE ({' OR '.join([ - ' AND '.join([ - f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" - for col in unique_columns - ]) - for _, row in dataframe.iterrows() - ])}); + SET {update_clause}; """ + # WHERE ({' OR '.join([ + # ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # for _, row in dataframe.iterrows() + # ])}); + # """ # Create engine engine = create_engine(f"sqlite:///{db_file}") diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 35ca3b3a..84bb298c 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,6 +1,16 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL - +from echopop.live.live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_bin_weights, + length_weight_regression, + number_proportions, + preprocess_biology_data, + weight_proportions +) # Set up `LiveSurvey` object live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" @@ -8,23 +18,140 @@ realtime_survey #################################################################################################### # TEST: ACOUSTICS +# Actual flow: +realtime_survey.load_acoustic_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_acoustic_data() +realtime_survey.estimate_population(working_dataset="acoustic") +amo = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +amo[amo.nasc > 0] +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") + +realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_biology_data() +realtime_survey.estimate_population(working_dataset="biology") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", 
table_name="strata_summary_df") +tbl = SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +tbl[tbl.weight > 0] +tbl.weight.sum() +# NOTE: Pulling successfully processed filenames +# ! This dictionary key name will change +realtime_survey.meta["provenance"][f"{working_dataset}_files"] +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") #################################################################################################### # NOTE: LOAD DATA +table_df[table_df.weight > 0] realtime_survey.load_acoustic_data() realtime_survey SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -SQL(realtime_survey.config["database"]["acoustics"], "map") +out = SQL(realtime_survey.config["database"]["acoustics"], "map") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +_ = SQL(realtime_survey.config["database"]["biology"], "drop", table_name="length_weight_df") realtime_survey.config["database"] realtime_survey.meta["provenance"] # NOTE: INITIAL PROCESSING [JUST ACOUSTIC] # ! ERRORS OUT WHEN NUMBER OF FILES == 1 realtime_survey.process_acoustic_data() +# ! sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "18": syntax error realtime_survey.estimate_population(working_dataset="acoustic") self = realtime_survey SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") #################################################################################################### # TEST: BIOLOGY +# Actual flow +realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_biology_data() +realtime_survey.estimate_population(working_dataset="biology") +self = realtime_survey +biology_unprocessed = self.input["biology"] +specimen_data = biology_unprocessed["specimen_df"] +length_data = biology_unprocessed["length_df"] +biology_dict = self.input["biology"] +file_configuration = self.config +strata_df = self.input["spatial"]["strata"] +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc, estimate_echometrics, integrate_nasc +from echopop.live.sql_methods import sql_group_update +from echopop.live.live_biology import summarize_strata +import numpy as np; import pandas as pd +echometrics: bool = True +acoustic_data_df = self.input["acoustics"]["prc_nasc_df"].copy() +spatial_column = ["stratum"] +# acoustic_data_df_copy = acoustic_data_df.copy() +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) + +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1).reset_index() +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) + +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, 
include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(lambda g: integrate_nasc(g, echometrics)).reset_index(drop=True) +dd.index.get_level_values(-1) +cc.index.get_level_values(-1) +( + acoustic_data_df + .groupby(['longitude', 'latitude', 'ping_time', 'source'] + spatial_column) + .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) + .reset_index() + # .rename_axis(None, axis=0) # Remove any unwanted hierarchical index +) +(acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) + .apply(integrate_nasc, echometrics=True) + .reset_index()) +acoustic_data_df = acoustic_data_df[acoustic_data_df.distance == 0.0] +acoustic_data_df = acoustic_data_df_copy[acoustic_data_df_copy.distance==0.0] +pd.Series(nasc_dict).index +pd.DataFrame.from_dict(nasc_dict, orient="columns") +pd.DataFrame(nasc_dict, index=[0]) +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(lambda df: integrate_nasc(df, echometrics)).reset_index() +) + +( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source", "stratum"], observed=False) + .apply(lambda df: integrate_nasc(df, echometrics=True), include_groups=False) + .reset_index() +) + +result_df = acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) \ + .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) \ + .reset_index() + +print(acoustic_data_df.columns) +print(acoustic_data_df_copy.columns) +print(acoustic_data_df.dtypes) +print(acoustic_data_df_copy.dtypes) +# Inspect DataFrame before groupby +print(acoustic_data_df.head()) +print(acoustic_data_df_copy.head()) + +print(acoustic_data_df["longitude"].unique()) +print(acoustic_data_df_copy["longitude"].unique()) + +print(acoustic_data_df["latitude"].unique()) +print(acoustic_data_df_copy["latitude"].unique()) + +print("Grouped original index levels:", grouped_original.size().index.names) +print("Grouped reset index levels:", grouped_reset.size().index.names) + +print(acoustic_data_df["ping_time"].unique()) +print(acoustic_data_df_copy["ping_time"].unique()) + +print(acoustic_data_df["source"].unique()) +print(acoustic_data_df_copy["source"].unique()) + +grouped_original = acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) +grouped_reset = acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) +print(acoustic_data_df.index) +print(acoustic_data_df_copy.index) +grouped_original.index +print(grouped_original.size()) +print(grouped_reset.size()) +sql_group_update(acoustic_db, sigma_bs_df, table_name="sigma_bs_mean_df") #################################################################################################### # NOTE: LOAD DATA realtime_survey.load_biology_data() @@ -34,6 +161,8 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") SQL(realtime_survey.config["database"]["biology"], "map") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["acoustics"], "drop", table_name="sigma_bs_mean_df") 
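# NOTE: Minimal sketch (toy, assumed values only) of the NASC-to-density conversion that the
# NOTE: acoustic/biology pipelines above apply before updating `survey_data_df`:
# NOTE: number_density = nasc / (4 * pi * sigma_bs_mean); biomass_density = number_density * average_weight
import numpy as np
import pandas as pd
toy_nasc = pd.DataFrame({"nasc": [1.5e3], "sigma_bs_mean": [2.0e-5], "average_weight": [0.5]})
toy_nasc["number_density"] = toy_nasc["nasc"] / (4.0 * np.pi * toy_nasc["sigma_bs_mean"])
toy_nasc["biomass_density"] = toy_nasc["number_density"] * toy_nasc["average_weight"]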
#################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### @@ -43,9 +172,48 @@ # ! SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) realtime_survey.estimate_population(working_dataset="biology") realtime_survey.estimate_population(working_dataset="acoustic") +self = realtime_survey +acoustic_dict = self.input["acoustics"] +strata_df = self.input["spatial"]["strata"] +file_configuration = self.config +from echopop.live.sql_methods import SQL, sql_group_update +from echopop.live.live_biology import summarize_strata + +db_file = acoustic_db +dataframe=nasc_biology +table_name="survey_data_df" +columns=["number_density", "biomass_density"] +unique_columns = ["stratum", "longitude", "latitude", "ping_time"] #################################################################################################### # TEST: GET DATA #################################################################################################### SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") + +SQL(acoustic_db, "drop", table_name="sigma_bs_mean_df") + +##### +# NOTE: Below are hypothetical visualizations +# +survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", + table_name="survey_data_df") + +import matplotlib.pyplot as plt +import numpy as np + +survey_data.loc[0, "nasc"] = 1e3 + +plt.plot(survey_data["longitude"], survey_data["latitude"]) +plt.scatter(survey_data["longitude"], survey_data["latitude"], s=survey_data["nasc"]) +plt.show() + +SQL(realtime_survey.config["database"]["biology"], "map") +# ! 
NEED TO ENSURE THAT TABLE FOR LENGTH/WEIGHT HISTOGRAM IS AVAILABLE +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +realtime_survey.input["spatial"]["strata"] +# +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 1c69351e..e6d00cc9 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -29,7 +29,7 @@ survey_2019.transect_analysis() survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] analysis_dict = survey_2019.analysis["transect"] - +SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") proportions_dict=analysis_dict["biology"]["proportions"]["number"] length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] From 0f31b20d1a4bad0a82902f0e70fa6bc7a3972696 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 8 Aug 2024 10:53:47 -0700 Subject: [PATCH 18/81] Cleaned up `test_workflow` --- echopop/test_workflow.py | 252 ++++++++------------------------------- 1 file changed, 51 insertions(+), 201 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 84bb298c..e52c6739 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,219 +1,69 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL -from echopop.live.live_biology import ( - bin_length_data, - compute_average_weights, - compute_sigma_bs, - length_bin_counts, - length_bin_weights, - length_weight_regression, - number_proportions, - preprocess_biology_data, - weight_proportions -) -# Set up `LiveSurvey` object + +#################################################################################################### +# TEST: Set up `LiveSurvey` object +# NOTE: General initialization parameter configuration live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +# NOTE: File configuration live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# NOTE: Create object realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) realtime_survey #################################################################################################### -# TEST: ACOUSTICS -# Actual flow: -realtime_survey.load_acoustic_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_acoustic_data() -realtime_survey.estimate_population(working_dataset="acoustic") -amo = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -amo[amo.nasc > 0] -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") - -realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_biology_data() -realtime_survey.estimate_population(working_dataset="biology") 
-SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -tbl = SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -tbl[tbl.weight > 0] -tbl.weight.sum() -# NOTE: Pulling successfully processed filenames -# ! This dictionary key name will change -realtime_survey.meta["provenance"][f"{working_dataset}_files"] -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -#################################################################################################### -# NOTE: LOAD DATA -table_df[table_df.weight > 0] +# TEST: TRIGGER --> NEW ACOUSTIC DATA +# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_acoustic_data() -realtime_survey -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -out = SQL(realtime_survey.config["database"]["acoustics"], "map") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -_ = SQL(realtime_survey.config["database"]["biology"], "drop", table_name="length_weight_df") -realtime_survey.config["database"] -realtime_survey.meta["provenance"] -# NOTE: INITIAL PROCESSING [JUST ACOUSTIC] -# ! ERRORS OUT WHEN NUMBER OF FILES == 1 +# NOTE: Process new acoustic data +# NOTE: This will update linked database tables realtime_survey.process_acoustic_data() -# ! sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "18": syntax error +# NOTE: Generate population estimates (or pass if there are no biological data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` realtime_survey.estimate_population(working_dataset="acoustic") -self = realtime_survey -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -#################################################################################################### -# TEST: BIOLOGY -# Actual flow -realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_biology_data() -realtime_survey.estimate_population(working_dataset="biology") -self = realtime_survey -biology_unprocessed = self.input["biology"] -specimen_data = biology_unprocessed["specimen_df"] -length_data = biology_unprocessed["length_df"] -biology_dict = self.input["biology"] -file_configuration = self.config -strata_df = self.input["spatial"]["strata"] -from echopop.live.live_acoustics import average_sigma_bs, compute_nasc, estimate_echometrics, integrate_nasc -from echopop.live.sql_methods import sql_group_update -from echopop.live.live_biology import summarize_strata -import numpy as np; import pandas as pd -echometrics: bool = True -acoustic_data_df = self.input["acoustics"]["prc_nasc_df"].copy() -spatial_column = ["stratum"] -# acoustic_data_df_copy = acoustic_data_df.copy() -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) - -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1).reset_index() -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) 
-acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) - -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(lambda g: integrate_nasc(g, echometrics)).reset_index(drop=True) -dd.index.get_level_values(-1) -cc.index.get_level_values(-1) -( - acoustic_data_df - .groupby(['longitude', 'latitude', 'ping_time', 'source'] + spatial_column) - .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) - .reset_index() - # .rename_axis(None, axis=0) # Remove any unwanted hierarchical index -) -(acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) - .apply(integrate_nasc, echometrics=True) - .reset_index()) -acoustic_data_df = acoustic_data_df[acoustic_data_df.distance == 0.0] -acoustic_data_df = acoustic_data_df_copy[acoustic_data_df_copy.distance==0.0] -pd.Series(nasc_dict).index -pd.DataFrame.from_dict(nasc_dict, orient="columns") -pd.DataFrame(nasc_dict, index=[0]) -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(lambda df: integrate_nasc(df, echometrics)).reset_index() -) - -( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source", "stratum"], observed=False) - .apply(lambda df: integrate_nasc(df, echometrics=True), include_groups=False) - .reset_index() -) - -result_df = acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) \ - .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) \ - .reset_index() - -print(acoustic_data_df.columns) -print(acoustic_data_df_copy.columns) -print(acoustic_data_df.dtypes) -print(acoustic_data_df_copy.dtypes) -# Inspect DataFrame before groupby -print(acoustic_data_df.head()) -print(acoustic_data_df_copy.head()) - -print(acoustic_data_df["longitude"].unique()) -print(acoustic_data_df_copy["longitude"].unique()) - -print(acoustic_data_df["latitude"].unique()) -print(acoustic_data_df_copy["latitude"].unique()) - -print("Grouped original index levels:", grouped_original.size().index.names) -print("Grouped reset index levels:", grouped_reset.size().index.names) - -print(acoustic_data_df["ping_time"].unique()) -print(acoustic_data_df_copy["ping_time"].unique()) - -print(acoustic_data_df["source"].unique()) -print(acoustic_data_df_copy["source"].unique()) - -grouped_original = acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) -grouped_reset = acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) -print(acoustic_data_df.index) -print(acoustic_data_df_copy.index) -grouped_original.index -print(grouped_original.size()) -print(grouped_reset.size()) -sql_group_update(acoustic_db, sigma_bs_df, table_name="sigma_bs_mean_df") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey 
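# NOTE: Hypothetical combined trigger loop (sketch only; how new-file triggers arrive is assumed
# NOTE: to be external): the same load -> process -> estimate sequence shown above can be driven
# NOTE: per incoming dataset type.
for dataset in ["acoustic", "biology"]:
    if dataset == "acoustic":
        realtime_survey.load_acoustic_data()
        realtime_survey.process_acoustic_data()
    else:
        realtime_survey.load_biology_data()
        realtime_survey.process_biology_data()
    realtime_survey.estimate_population(working_dataset=dataset)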
#################################################################################################### -# NOTE: LOAD DATA +# TEST: TRIGGER --> NEW BIOLOGY DATA +# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_biology_data() -# NOTE: INITIAL PROCESSING [JUST BIOLOGY] +# NOTE: Process new biological data +# NOTE: This will update linked database tables realtime_survey.process_biology_data() +# NOTE: Generate population estimates (or pass if there are no acoustic data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` realtime_survey.estimate_population(working_dataset="biology") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") -SQL(realtime_survey.config["database"]["biology"], "map") -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["acoustics"], "drop", table_name="sigma_bs_mean_df") -#################################################################################################### -# TEST: POPULATION ESTIMATES -#################################################################################################### -# NOTE: Acoustic / biological data converge here to derive population estimates -# TODO: Add argument that indicates what the new datasets and what data need to be pulled in -# TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} -# ! SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) -realtime_survey.estimate_population(working_dataset="biology") -realtime_survey.estimate_population(working_dataset="acoustic") -self = realtime_survey -acoustic_dict = self.input["acoustics"] -strata_df = self.input["spatial"]["strata"] -file_configuration = self.config -from echopop.live.sql_methods import SQL, sql_group_update -from echopop.live.live_biology import summarize_strata - -db_file = acoustic_db -dataframe=nasc_biology -table_name="survey_data_df" -columns=["number_density", "biomass_density"] -unique_columns = ["stratum", "longitude", "latitude", "ping_time"] +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey #################################################################################################### -# TEST: GET DATA +# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# NOTE: `LiveSurvey.meta` attribute +# ---- ACOUSTIC +realtime_survey.meta["provenance"]["acoustic_files"] +# ---- BIOLOGICAL +realtime_survey.meta["provenance"]["biology_files"] +# NOTE: SQL function query from database file [cumulative list] +# ---- ACOUSTIC +SQL(db_file=realtime_survey.config["database"]["acoustics"], + command="select", table_name="files_processed") +# ---- BIOLOGICAL +SQL(db_file=realtime_survey.config["database"]["biology"], + command="select", table_name="files_processed") #################################################################################################### -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# !!! 
The SQL functions will fail if the tables have not yet been created/initialized +# ---- ACOUSTICS +# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") - -SQL(acoustic_db, "drop", table_name="sigma_bs_mean_df") - -##### -# NOTE: Below are hypothetical visualizations -# -survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", - table_name="survey_data_df") - -import matplotlib.pyplot as plt -import numpy as np - -survey_data.loc[0, "nasc"] = 1e3 - -plt.plot(survey_data["longitude"], survey_data["latitude"]) -plt.scatter(survey_data["longitude"], survey_data["latitude"], s=survey_data["nasc"]) -plt.show() - -SQL(realtime_survey.config["database"]["biology"], "map") -# ! NEED TO ENSURE THAT TABLE FOR LENGTH/WEIGHT HISTOGRAM IS AVAILABLE -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Along-track acoustically-derived number/biomass densities and NASC +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# ---- BIOLOGICAL +# NOTE: Fitted (discretized) length-weight relationship SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -realtime_survey.input["spatial"]["strata"] -# -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file +# NOTE: Quantized length-binned weights (summed) +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Average weights per stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") \ No newline at end of file From 40f3d7bfc4e368a52304d7453d970ac002813a7b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 11:35:28 -0700 Subject: [PATCH 19/81] YAML config settings adjustment for db dir --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 5 +- echopop/live/live_acoustics.py | 27 +- echopop/live/live_biology.py | 14 +- echopop/live/live_data_loading.py | 9 +- echopop/live/live_data_processing.py | 54 ++- echopop/live/live_spatial_methods.py | 410 +++++++++++++++--- echopop/live/live_survey.py | 8 +- echopop/live/sql_methods.py | 5 +- 9 files changed, 423 insertions(+), 113 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 9436cefc..ae265343 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -35,8 +35,8 @@ longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: - x_distance: 50.0 - y_distance: 50.0 + x_distance: 25.0 + y_distance: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. 
This diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index 4111ea05..e52db83c 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -15,6 +15,7 @@ species: # Directory path that contains all input data needed data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files +database_directory: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database ############################################################################## # Input data directories @@ -44,6 +45,8 @@ input_directories: trawl_info: operation_info coastline: directory: coastline/ - coastline_name: ne_110m_land + coastline_name: ne_10m_land + grid: + database_name: grid.db ... diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 2da07e07..6c1ebf08 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -3,7 +3,7 @@ import pandas as pd from ..acoustics import ts_length_regression, to_linear, to_dB -from .live_spatial_methods import apply_spatial_definitions +from .live_spatial_methods import apply_spatial_definitions, apply_griddify_definitions from .sql_methods import sql_data_exchange, SQL, query_processed_files # TODO: Documentation @@ -26,7 +26,7 @@ def configure_transmit_frequency(frequency_values: pd.Series, return frequency_values # TODO: Documentation -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, +def preprocess_acoustic_data(survey_data: pd.DataFrame, spatial_dict: dict, file_configuration: dict) -> pd.DataFrame: @@ -37,15 +37,21 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df.loc[:, "frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df.loc[:, "frequency_nominal"], + survey_data.loc[:, "frequency_nominal"] = ( + configure_transmit_frequency(survey_data.loc[:, "frequency_nominal"], transmit_settings, acoustic_analysis_settings["dataset_units"]["frequency"]) ) # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + survey_data[survey_data["frequency_nominal"] == transmit_settings["frequency"]] ) + + # Get grid coordinates + prc_nasc_df_filtered = pd.concat([ + prc_nasc_df_filtered, + apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]) + ], axis = 1) # Apply spatial settings prc_nasc_df_filtered = ( @@ -192,7 +198,10 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True): # Get spatial definitions, if any - spatial_column = file_configuration["spatial_column"] + # spatial_column = file_configuration["spatial_column"] + + # Get stratum column, if any + gridding_column = file_configuration["gridding_column"] # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources @@ -208,7 +217,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # else: nasc_data_df = ( acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + .groupby(["longitude", "latitude", "ping_time", "source"] + gridding_column, observed=False) .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) .reset_index() @@ -225,7 +234,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, ) # ---- Reorder columns 
nasc_data_df = nasc_data_df[ - spatial_column + gridding_column + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", "occupied_area"] @@ -244,7 +253,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Add population-specific columns (specified in the file configuration) # TODO: Add to `yaml` file for configuration; hard-code for now - add_columns = ["number_density", "biomass_density", "abundance", "biomass"] + add_columns = ["number_density", "biomass_density"] # ---- df[add_columns] = 0.0 # ---- Assign values for key values diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 27a53bd1..5fcf3c32 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -183,21 +183,21 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df.loc[:, "id"] = sigma_bs_df[key_list].apply(tuple, axis=1).astype(str) # Get the database file name - acoustic_db = file_configuration["database"]["acoustics"] + biology_db = file_configuration["database"]["biology"] # Check for `sigma_bs_mean_df` in the database file # ---- Query database - if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + if not SQL(biology_db, "validate", table_name="sigma_bs_mean_df"): # ---- Create an insertion dataframe insertion_df = sigma_bs_df.copy() # ---- Create - SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, + SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, primary_keys=["id"]) # ---- Populate table - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) else: # ---- Get previous values in the table - table_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") + table_df = SQL(biology_db, "select", table_name="sigma_bs_mean_df") # ---- Check the table keys table_keys = np.unique(table_df["id"]).tolist() # ---- Get unique values @@ -211,13 +211,13 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create DataFrame insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] # ---- INSERT - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) # ---- UPDATE values if update_keys: update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - sql_group_update(acoustic_db, dataframe=update_df, table_name="sigma_bs_mean_df", + sql_group_update(biology_db, dataframe=update_df, table_name="sigma_bs_mean_df", columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", unique_columns=["id"], id_columns=["id"]) # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f507d63f..84316027 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -112,10 +112,14 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] # Add SQL file to dict + # file_configuration["database"]["biology"] = ( + # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + # ) 
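+    # NOTE: The biology database file is assumed to resolve from the `database_directory` key
+    # NOTE: defined in the survey-year YAML, rather than from `data_root_dir`/"database"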
file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + Path(file_configuration["database_directory"]) / file_settings["database_name"] ) + # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists @@ -540,6 +544,9 @@ def configure_spatial_settings(file_configuration: dict): # ---- Empty `spatial_column` key file_configuration.update({"spatial_column": []}) + # Add grid + file_configuration.update({"gridding_column": file_configuration["stratum_column"] + ["x", "y"]}) + # Return the dictionary as an output return spatial_dict diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 18d493d0..c46317a0 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -20,17 +20,25 @@ def get_unique_identifiers(data_dict: dict, unique_columns: List[str]) -> pd.DataFrame: # Gather all dataframes from a dictionary into a list - df_list = [df for _, df in data_dict.items()] + if isinstance(data_dict, dict): + df_list = [df for _, df in data_dict.items()] + else: + df_list = [data_dict] # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - for df in df_list if isinstance(df, pd.DataFrame) and not df.empty] + # dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns + # for df in df_list if isinstance(df, pd.DataFrame) and not df.empty and col in df.columns] + combined_df = pd.concat( + [df[unique_columns] for df in df_list if all(col in df.columns for col in unique_columns)], + ignore_index=True + ).drop_duplicates() # Reduce into a single DataFrame - if len(unique_columns) > 1: - return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) - else: - return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) + return combined_df + # if len(unique_columns) > 1: + # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # else: + # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) def query_dataset(db_file: str, data_dict: dict, @@ -49,14 +57,18 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string - conditional_str = ( - " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - for col in unique_keys_df.columns]) - ) + # ---- Create conditional string + conditional_str = " | ".join( + [" & ".join([f"{col} = {val}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows()] + ) + # conditional_str = ( + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) + # ) # ---- Append the additional constraint statement if present if constraint is not None: - conditional_str += f" & {constraint}" + conditional_str = f"({conditional_str})" + f" & {constraint}" # ---- SELECT the dataset using the conidtional statement data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, condition=conditional_str).filter(data_columns) @@ -90,7 +102,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Get spatial column spatial_column = file_configuration["spatial_column"] - unique_columns = spatial_column 
+ contrast_columns + gridding_column = file_configuration["gridding_column"] + unique_columns = gridding_column + contrast_columns # Get database file acoustic_db = file_configuration["database"]["acoustics"] @@ -112,15 +125,15 @@ def acoustic_pipeline(acoustic_dict: dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + sigma_bs_df = get_sigma_bs_sql_data(biology_db, acoustic_dict, - unique_columns=unique_columns) + unique_columns=["stratum"]) # Calculate population estimates if valid data are available if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column + contrast_columns) # ---- Compute the number densities (animals nmi^-2) nasc_biology["number_density"] = ( nasc_biology["nasc"] @@ -130,11 +143,12 @@ def acoustic_pipeline(acoustic_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, acoustic_dict, - unique_columns=unique_columns) + unique_columns=spatial_column + contrast_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + nasc_biology = nasc_biology.merge(weight_spatial_averages, + on=spatial_column + contrast_columns) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -156,7 +170,7 @@ def get_nasc_sql_data(db_file: str, # Add SELECTION columns data_columns = ( - unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + unique_columns + ["longitude", "latitude", "ping_time", "nasc", "number_density", "biomass_density", "id"] ) # ----- Get the SQL dataset diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 6ce7741f..29f5df4e 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -4,6 +4,9 @@ from geopy.distance import distance from ..spatial.projection import utm_string_generator import shapely.geometry +from shapely.geometry import box +import sqlalchemy as sqla +from pathlib import Path from typing import Union def create_inpfc_strata(spatial_config: dict): @@ -143,7 +146,7 @@ def define_boundary_box(boundary_dict: dict, projection: str): crs=projection, ) -def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): +def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Extract the griddification definitions griddify_definitions = spatial_config["griddify"] @@ -190,77 +193,346 @@ def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_ # Get the centroids cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: + # Convert to GeoDataFrame + dataset_gdf = gpd.GeoDataFrame( + data=dataset, + geometry=gpd.points_from_xy(dataset["longitude"], dataset["latitude"]), + crs=projection, + ) + # ---- To UTM + dataset_gdf = dataset_gdf.to_crs(projection_new) + + # Extract x- and y-coordinates + dataset_gdf["x"] = dataset_gdf["geometry"].x + dataset_gdf["y"] = dataset_gdf["geometry"].y + + # Bin 
the longitude data + dataset_gdf["stratum_x"] = pd.cut( + dataset_gdf["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + # Bin the latitude data + dataset_gdf["stratum_y"] = pd.cut( + dataset_gdf["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # Update the original dataset + return ( + dataset_gdf.loc[:, ["stratum_x", "stratum_y"]] + .rename(columns={"stratum_x": "x", "stratum_y": "y"}) + ) + # dataset.loc[:, "x"] = dataset_gdf.copy().loc[:, "stratum_x"] + # dataset.loc[:, "y"] = dataset_gdf.copy().loc[:, "stratum_y"] - # - prc_nasc_df = acoustic_data["prc_nasc_df"] - # to GDF - prc_nasc_gdf = gpd.GeoDataFrame( - data=prc_nasc_df, - geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), - crs=projection, - ) - # to UTM - prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) - - prc_nasc_new["x"] = prc_nasc_new["geometry"].x - prc_nasc_new["y"] = prc_nasc_new["geometry"].y - - # ---- Bin the latitude data - prc_nasc_new["stratum_x"] = pd.cut( - prc_nasc_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - prc_nasc_new["stratum_y"] = pd.cut( - prc_nasc_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - acoustic_data["prc_nasc_df"]["stratum"] = ( - prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) - ) +# def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - if not biology_data["trawl_info_df"].empty: +# # Extract the griddification definitions +# griddify_definitions = spatial_config["griddify"] - # - trawl_info_df = biology_data["trawl_info_df"] +# # Get the projection definition +# projection = spatial_config["projection"] - # to GDF - trawl_info_gdf = gpd.GeoDataFrame( - data=trawl_info_df, - geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), - crs=projection, - ) - # to UTM - trawl_info_new = trawl_info_gdf.to_crs(projection_new) - - trawl_info_new["x"] = trawl_info_new["geometry"].x - trawl_info_new["y"] = trawl_info_new["geometry"].y - - # ---- Bin the latitude data - trawl_info_new["stratum_x"] = pd.cut( - trawl_info_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - trawl_info_new["stratum_y"] = pd.cut( - trawl_info_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - biology_data["trawl_info_df"]["stratum"] = ( - trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# # Compute the boundary box GeoDataFrame +# boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + +# # Convert the coordinates, if needed +# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): +# # ---- Compute the equivalent UTM string +# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), +# np.median(boundary_box.loc[0:3, "y"]))) +# # ---- Compute the boundary box GeoDataFrame with the new projection +# boundary_box = boundary_box.to_crs(utm_num) +# # 
---- Create a new projection for later +# projection_new = f"epsg:{utm_num}" +# else: +# projection_new = projection + +# # Define the step sizes +# # ---- Define x step size +# x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters +# # ---- Define y step size +# y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + +# # Get the boundary tuple +# xmin, ymin, xmax, ymax = boundary_box.total_bounds + +# # Generate the cells +# grid_cells = [] +# # ---- Iterate through +# for y0 in np.arange(ymin, ymax+y_step, y_step): +# for x0 in np.arange(xmin, xmax+x_step, x_step): +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# # Convert to a GeoDataFrame +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + +# # Get the centroids +# cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + +# # Get the `prc_nasc_df` values, if they exist, and apply stratification information +# if not acoustic_data["prc_nasc_df"].empty: + +# # +# prc_nasc_df = acoustic_data["prc_nasc_df"] + +# # to GDF +# prc_nasc_gdf = gpd.GeoDataFrame( +# data=prc_nasc_df, +# geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), +# crs=projection, +# ) +# # to UTM +# prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + +# prc_nasc_new["x"] = prc_nasc_new["geometry"].x +# prc_nasc_new["y"] = prc_nasc_new["geometry"].y + +# # ---- Bin the latitude data +# prc_nasc_new["stratum_x"] = pd.cut( +# prc_nasc_new["x"], +# np.arange(xmin, xmax+x_step, x_step), +# right = True, +# labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), +# ).astype(int) + 1 + +# prc_nasc_new["stratum_y"] = pd.cut( +# prc_nasc_new["y"], +# np.arange(ymin, ymax+y_step, y_step), +# right = True, +# labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), +# ).astype(int) + 1 + +# # +# acoustic_data["prc_nasc_df"]["stratum"] = ( +# prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) +# ) + +# if not biology_data["trawl_info_df"].empty: + +# # +# trawl_info_df = biology_data["trawl_info_df"] + +# # to GDF +# trawl_info_gdf = gpd.GeoDataFrame( +# data=trawl_info_df, +# geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), +# crs=projection, +# ) +# # to UTM +# trawl_info_new = trawl_info_gdf.to_crs(projection_new) + +# trawl_info_new["x"] = trawl_info_new["geometry"].x +# trawl_info_new["y"] = trawl_info_new["geometry"].y + +# # ---- Bin the latitude data +# trawl_info_new["stratum_x"] = pd.cut( +# trawl_info_new["x"], +# np.arange(xmin, xmax+x_step, x_step), +# right = True, +# labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), +# ).astype(int) + 1 + +# trawl_info_new["stratum_y"] = pd.cut( +# trawl_info_new["y"], +# np.arange(ymin, ymax+y_step, y_step), +# right = True, +# labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), +# ).astype(int) + 1 + +# # +# biology_data["trawl_info_df"]["stratum"] = ( +# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# ) + +def initialize_grid(file_configuration = dict): + + # Get root directory, if defined + if "data_root_dir" in file_configuration: + root_dir = Path(file_configuration["data_root_dir"]) + else: + root_dir = Path() + + # Get `grid` settings + grid_database = file_configuration["input_directories"]["grid"]["database_name"] + # ---- + db_directory = Path(file_configuration["database_directory"]) + + 
# Create full filepath + # db_filepath = root_dir / "database" / grid_database + db_filepath = db_directory / grid_database + # ---- Update config + file_configuration["database"]["grid"] = db_filepath + + # Create if file doesn't already exist + if not db_filepath.exists(): + + # Get projection + projection = file_configuration["geospatial"]["projection"] + + # Get grid settings + grid_settings = file_configuration["geospatial"]["griddify"] + + # Get the resolution + resolution = grid_settings["grid_resolution"] + # ---- Convert from nmi to m + resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + + # Get boundary coordinates + boundary = grid_settings["bounds"] + # ---- x + x = boundary["longitude"] + # ---- y + y = boundary["latitude"] + # ---- Create DataFrame + boundary_df = pd.DataFrame({ + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) + }) + + # Create GeoDataFrame + boundary_gdf = gpd.GeoDataFrame( + data = boundary_df, + geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), + crs = projection ) + + # Convert to UTM (decimal degrees to m) + # ---- Create UTM code + utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2) + # ---- Create number code + utm_num = int(utm_code) + # ---- UTM conversion + boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + + # Get step sizes for each grid cell + # ---- x + x_step = resolution_m["x_distance"] + # ---- y + y_step = resolution_m["y_distance"] + + # Prepare grid cell generation + # ---- Get new boundaries + xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds + # ---- Initialize empty list + grid_cells = [] + # ---- Initialize coordinate counter + y_ct = 0 + x_coord = []; y_coord = [] + # ---- Iterate through to generate cells + for y0 in np.arange(ymin, ymax, y_step): + y_ct += 1 + x_ct = 0 + for x0 in np.arange(xmin, xmax, x_step): + x_ct += 1 + # ---- Step forward + x_coord.append(x_ct) + y_coord.append(y_ct) + x1 = x0 - x_step + y1 = y0 + y_step + # ---- Append to list + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) + # ---- Add cordinates + cells_gdf.loc[:, "x"] = np.array(x_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) + + # Get coastline shapefile directory, if defined + if "coastline" in file_configuration["input_directories"]: + + # Get coastline settings + coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Create filepath + shp_filepath = ( + root_dir / coast_settings["directory"] + / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + ) + # ---- Validate existence + if not shp_filepath.exists(): + raise FileNotFoundError( + f"{shp_filepath} does not exist!" 
+ ) + + # Get original lat/lon geometry boundaries + xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + + # Read in file + full_coast = gpd.read_file(shp_filepath) + # ---- Convert to UTM + full_coast_utm = full_coast.to_crs(utm_code) + # ---- Remove empty + full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + + # Create bouning box with a buffer + boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) + # ---- Create an unbuffered copy + boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) + # ---- Convert to a GeoDataFrame + boundary_box_unbuffered_gdf = ( + gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + ) + # ---- Clip the coastline for saving + clipped_coast_original = ( + gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + ) + + # Clip the coastline shapefile + clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + + # Clip the grid cells + cells_gdf.loc[:, "geometry"] = ( + cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + ) + + # Calculate area per cell + cells_gdf.loc[:, "area"] = cells_gdf.area + + # Convert back to original projection and clip + clipped_cells_latlon = ( + gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) + .reset_index(drop=True) + ) + + # Initialize empty columns that can be added to later on + clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", + "abundance", "biomass"]] = 0.0 + + # Create output DataFrame + output_df = pd.DataFrame({ + "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Add the required columns + output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], + axis=1) + # ---- Initialize empty columns that can be added to later on + output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", + "biomass"]] = 0.0 + + # Write to the database file (for the grid) + # ---- Create engine + engine = sqla.create_engine(f"sqlite:///{db_filepath}") + # ---- Connect and create table + _ = output_df.to_sql("grid_df", engine, if_exists="replace", index=False) + + # Write to the database file (for the coastline shapefile) + # ---- Create output copy + coastline_out = pd.DataFrame({ + "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Concatenate + coastline_out = ( + pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + ) + # ---- Connect and create table + _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index f3cb7f5a..58c5c27c 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -35,6 +35,7 @@ weight_proportions ) +from .live_spatial_methods import initialize_grid from . import live_data_processing as eldp from . 
import live_data_loading as eldl @@ -73,6 +74,9 @@ def __init__( # Initialize the results attribute self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + # Initialize the extrapolation grid + initialize_grid(self.config) + # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) @@ -204,8 +208,8 @@ def process_biology_data(self): # Compute `sigma_bs` by sending it to the appropriate database table compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + biology_unprocessed["length_df"], + self.config) # Bin the length measurements of the biological data bin_length_data(biology_unprocessed, self.config["length_distribution"]) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index f680e908..f9dd36eb 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -380,7 +380,8 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = root_directory / "database" + # db_directory = root_directory / "database" + db_directory = Path(file_settings["database_directory"]) # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file @@ -625,7 +626,7 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" + db_directory = Path(file_configuration["database_directory"]) # ---- Complete path to the database file db_file = db_directory / db_name From 63e79614622c5a13c87cd01ba934febcc3d0bd4f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:13:50 -0700 Subject: [PATCH 20/81] f-string fix for coastline db file creation --- echopop/live/live_spatial_methods.py | 2 +- echopop/mesh_generation.py | 132 +++-------- echopop/zarr_read_ingest_test.py | 339 +++++++++++++++++++++++++-- 3 files changed, 354 insertions(+), 119 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 29f5df4e..d8a46523 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -456,7 +456,7 @@ def initialize_grid(file_configuration = dict): # ---- Create filepath shp_filepath = ( root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + / coast_settings["coastline_name"] / f"{coast_settings['coastline_name']}.shp" ) # ---- Validate existence if not shp_filepath.exists(): diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index bb78e1ba..699eed4f 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -12,107 +12,8 @@ # Create the grid points grid_points = [(i, j, 0) for i in x for j in y] -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files = 
query_acoustic_db_files(file_configuration, acoustic_files) - - # Read in the acoustic data files - # ! [REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - - # Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - - # Return output - return prc_nasc_df_processed - -def read_acoustic_zarr(acoustic_files: Path) -> tuple: - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - 
file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output +def initialize_grid(): data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") @@ -2016,6 +1917,12 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): boundary_dict = griddify_definitions["bounds"] +from geopy.distance import distance +import numpy as np +import pandas as pd +import geopandas as gpd +from echopop.spatial.projection import utm_string_generator + ## grid_settings["grid_resolution"]["x"] = 50 grid_settings["grid_resolution"]["y"] = 50 @@ -2034,6 +1941,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): crs = projection ) from echopop.spatial.projection import utm_string_generator +import shapely.geometry utm_string_generator(-117.0, 33.75) bound_gdf.total_bounds # Convert to UTM @@ -2125,7 +2033,30 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # plt.xlim(lon_min-3, lon_max+3) # plt.ylim(lat_min-3, lat_max+3) # plt.show() +test = SQL(db_filepath, "select", table_name="grid_df") +from shapely import wkt +import matplotlib.pyplot as plt + +test = output_df.copy() +test["geometry"] = test["geometry"].apply(wkt.loads) +test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs=projection) + +co = SQL(db_filepath, "select", table_name="coastline_df") +co["geometry"] = co["geometry"].apply(wkt.loads) +co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + +lims = test_gdf.total_bounds + +fig, ax = plt.subplots(figsize=(10, 10)) +test_gdf.plot(ax=ax, column="abundance", edgecolor="black", cmap="viridis", legend=False) +co_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +plt.xlim(lims[0]*1.005, lims[2]*1.01) +plt.ylim(lims[1]*0.98, lims[3]*1.005) +plt.show() + +test["geometry"].apply(wkt.loads) +clipped_cells_latlon["geometry"] len(bbox_latlon.exterior.coords) len(buffer_boundary.exterior.coords) @@ -2151,6 +2082,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' cells_latlon_clipped.to_crs(custom_crs).crs ######## +import sqlalchemy as sqla import matplotlib.colors as colors import matplotlib.cm as cm cells_transformed = cells_latlon.to_crs(utm_code) diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index e6d00cc9..0512b667 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -21,7 +21,7 @@ from echopop.live import live_data_loading as eldl from echopop.live.live_data_processing import query_dataset, get_unique_identifiers from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import integrate_nasc +from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -34,26 +34,200 @@ length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] -files = data_files +updated_survey_data = nasc_biology.copy() +gridding_column = file_configuration["gridding_column"] +unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) - - - # Map the table names and validate table creation - # ---- Get table names - tables = SQL(db_file, "map") - # ---- `files_read` - if 
"files_read" not in tables: - raise KeyError( - f"SQL database table `files_read` in `{db_file}` failed to initialize!" - ) - # ---- `files_processed` - if "files_processed" not in tables: - raise KeyError( - f"SQL database table `files_processed` in `{db_file}` failed to initialize!" +file_configuration = self.config +grid_settings["grid_resolution"]["x"] = 50 +grid_settings["grid_resolution"]["y"] = 50 +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +self = realtime_survey +file_configuration = self.config + +def initialize_grid(): + + # Get root directory, if defined + if "data_root_dir" in file_configuration: + root_dir = Path(file_configuration["data_root_dir"]) + else: + root_dir = Path() + + # Get `grid` settings + grid_database = file_configuration["input_directories"]["grid"]["database_name"] + + # Create full filepath + db_filepath = root_dir / "database" / grid_database + + # Create if file doesn't already exist + if not db_filepath.exists(): + + # Get projection + projection = file_configuration["geospatial"]["projection"] + + # Get grid settings + grid_settings = file_configuration["geospatial"]["griddify"] + + # Get the resolution + resolution = grid_settings["grid_resolution"] + # ---- Convert from nmi to m + resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + + # Get boundary coordinates + boundary = grid_settings["bounds"] + # ---- x + x = boundary["longitude"] + # ---- y + y = boundary["latitude"] + # ---- Create DataFrame + boundary_df = pd.DataFrame({ + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) + }) + + # Create GeoDataFrame + boundary_gdf = gpd.GeoDataFrame( + data = boundary_df, + geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), + crs = projection ) - + + # Convert to UTM (decimal degrees to m) + # ---- Create UTM code + utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2) + # ---- Create number code + utm_num = int(utm_code) + # ---- Create string code + utm_str = f"epsg:{utm_num}" + # ---- UTM conversion + boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + + # Get step sizes for each grid cell + # ---- x + x_step = resolution_m["x_distance"] + # ---- y + y_step = resolution_m["y_distance"] + + # Prepare grid cell generation + # ---- Get new boundaries + xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds + # ---- Initialize empty list + grid_cells = [] + # ---- Initialize coordinate counter + y_ct = 0 + x_coord = []; y_coord = [] + # ---- Iterate through to generate cells + for y0 in np.arange(ymin, ymax, y_step): + y_ct += 1 + x_ct = 0 + for x0 in np.arange(xmin, xmax, x_step): + x_ct += 1 + # ---- Step forward + x_coord.append(x_ct) + y_coord.append(y_ct) + x1 = x0 - x_step + y1 = y0 + y_step + # ---- Append to list + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) + # ---- Add cordinates + cells_gdf.loc[:, "x"] = np.array(x_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) + + # Get coastline shapefile directory, if defined + if "coastline" in file_configuration["input_directories"]: + + # Get coastline settings + coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Create filepath + 
shp_filepath = ( + root_dir / coast_settings["directory"] + / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + ) + # ---- Validate existence + if not shp_filepath.exists(): + raise FileNotFoundError( + f"{shp_filepath} does not exist!" + ) + + # Get original lat/lon geometry boundaries + xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + + # Read in file + full_coast = gpd.read_file(shp_filepath) + # ---- Convert to UTM + full_coast_utm = full_coast.to_crs(utm_code) + # ---- Remove empty + full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + + # Create bouning box with a buffer + boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) + # ---- Create an unbuffered copy + boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) + # ---- Convert to a GeoDataFrame + boundary_box_unbuffered_gdf = ( + gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + ) + # ---- Clip the coastline for saving + clipped_coast_original = ( + gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + ) + + # Clip the coastline shapefile + clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + + # Clip the grid cells + cells_gdf.loc[:, "geometry"] = ( + cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + ) + + # Calculate area per cell + cells_gdf.loc[:, "area"] = cells_gdf.area + + # Convert back to original projection and clip + clipped_cells_latlon = ( + gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) + .reset_index(drop=True) + ) + + # Initialize empty columns that can be added to later on + clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", + "abundance", "biomass"]] = 0.0 + + # Create output DataFrame + output_df = pd.DataFrame({ + "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Add the required columns + output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], + axis=1) + # ---- Initialize empty columns that can be added to later on + output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", + "biomass"]] = 0.0 + + # Write to the database file (for the grid) + # ---- Create engine + engine = sqla.create_engine(f"sqlite:///{db_filepath}") + # ---- Connect and create table + _ = output_df.to_sql("grid_df", engine, if_exists="replace") + + # Write to the database file (for the coastline shapefile) + # ---- Create output copy + coastline_out = pd.DataFrame({ + "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Concatenate + coastline_out = ( + pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + ) + # ---- Connect and create table + _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") #################################################################################################### # TEST: YAML FILE CONFIGURATION @@ -245,9 +419,138 @@ def biology_pipeline(biology_dict: dict, data_table = "grid" grid_table = "reference" column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] -coordinates = ["x", "y"] + dataframe = nasc_biology_output +import sqlalchemy as sqla +grid_db_file = file_configuration["database"]["grid"] +survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" +data_table = "survey_data_df" +grid_table = "grid_df" +coordinates = ["x", "y"] +from echopop.live.sql_methods import SQL + +SQL(grid_db_file, "select", 
table_name=grid_table) +SQL(survey_db_file, "select", table_name=data_table) +SQL(data_table, "map") + +updated_survey_data = nasc_biology.copy() +# Get relevant table +previous_grid = query_dataset(grid_db_file, updated_survey_data, + table_name=grid_table, + data_columns=["x", "y", "area", "number_density_mean", + "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) + +# Get unique coordinates +update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) +update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + + + +number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + +SQL(grid_db_file, "select", table_name=grid_table) + + + +pulled_data = pd.concat([SQL(grid_db_file, "select", + table_name=grid_table, + condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) +previous_cell_data = pd.concat([SQL(survey_db_file, "select", + table_name=data_table, + condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) + +from echopop.live.live_data_processing import get_nasc_sql_data, get_sigma_bs_sql_data, get_average_strata_weights, summarize_strata +from echopop.live.sql_methods import sql_group_update +from typing import List +from shapely.geometry import box +SQL(grid_db_file, "select", table_name="grid_df") +# Compute means +number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() +previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() + +[SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] + +# Write to the database file (for the grid) +# ---- Create engine +engine = sqla.create_engine(f"sqlite:///{db_filepath}") + +def update_population_grid(grid_db_file: str, + data_table: str, + grid_table: str, + dataframe: pd.DataFrame, + column_pairs: Union[List[tuple[str, str]], tuple[str, str]], + coordinates: List[str]): + + # Convert `column_pairs` to a list, if needed + if not isinstance(column_pairs, list): + column_pairs = [column_pairs] + + dataframe[coordinates] + # Format the coordinate pairs + # ---- Convert coordinate values into a list of tuples + coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] + # ---- Get unique pairs + coords = list(set(coord_pairs)) + + # Format the SQL script command + # ---- Initialize + sql_script = [] + # ---- Iteratively update + for input_column, output_column in column_pairs: + sql_script.append( + f""" + BEGIN TRANSACTION; + + -- Calculate averages for input_column and update grid_table + WITH avgs AS ( + SELECT + {coordinates[0]}, + {coordinates[1]}, + AVG(d.{input_column}) as avg_value + FROM {data_table} d + GROUP BY d.{coordinates[0]}, d.{coordinates[1]} + ) + + -- Update the grid_table with both average and computed total + UPDATE {grid_table} + SET + mean_{input_column} = ( + SELECT avg_value + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ), + {output_column} = ( + SELECT avg_value * {grid_table}.area + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ) + WHERE 
EXISTS ( + SELECT 1 + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ); + + COMMIT; + """ + ) + + # Create the engine + engine = create_engine(f"sqlite:///{db_file}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript("\n".join(sql_script)) + + + def update_population_grid(db_file: str, data_table: str, grid_table: str, From ab6d9ffdaf084c3f2000c8b510565a5841894acb Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:32:04 -0700 Subject: [PATCH 21/81] Fix to stratum/spatial config key name --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 84316027..e42a86dd 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -545,7 +545,7 @@ def configure_spatial_settings(file_configuration: dict): file_configuration.update({"spatial_column": []}) # Add grid - file_configuration.update({"gridding_column": file_configuration["stratum_column"] + ["x", "y"]}) + file_configuration.update({"gridding_column": file_configuration["spatial_column"] + ["x", "y"]}) # Return the dictionary as an output return spatial_dict From 8dd470c7087d35991dc0d57d2d61bfebf673317b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:37:57 -0700 Subject: [PATCH 22/81] Fix to database directory initialization --- echopop/live/live_data_loading.py | 2 ++ echopop/live/sql_methods.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index e42a86dd..8ef37ce5 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -232,6 +232,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, raise TypeError( "Data loading argument `input_filenames` must be a list." 
) + # + root_directory = file_configuration["database_directory"] # Initialize the database file initialize_database(root_directory, file_settings) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index f9dd36eb..67cf424c 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -380,8 +380,8 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create filepath to the SQL database # ---- Create Path to SQL database file - # db_directory = root_directory / "database" - db_directory = Path(file_settings["database_directory"]) + db_directory = Path(root_directory) + # db_directory = Path(file_settings["database_directory"]) # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file From b6fbae513983bb306068c4c78d9e34c7fccc3639 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:41:48 -0700 Subject: [PATCH 23/81] Additional db directorypath changes/fixes --- echopop/live/sql_methods.py | 3 ++- echopop/zarr_read_ingest_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 67cf424c..7ae3824f 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -626,7 +626,8 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(file_configuration["database_directory"]) + # db_directory = Path(file_configuration["database_directory"]) + db_directory = Path(root_directory) # ---- Complete path to the database file db_file = db_directory / db_name diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 0512b667..2df92682 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -442,6 +442,15 @@ def biology_pipeline(biology_dict: dict, "biomass_density_mean", "abundance", "biomass"], unique_columns=["x", "y"]) +# Index +previous_grid.set_index(["x", "y"], inplace=True) +previous_grid["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +previous_grid["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() + +# Convert area from m^2 to nmi^2 +previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] +previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] + # Get unique coordinates update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() From 6c6214f870c00d133588d1c819cd432c27d8641f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:45:41 -0700 Subject: [PATCH 24/81] Fix `data_root_dir` missing workaround --- echopop/live/live_data_loading.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 8ef37ce5..25c89064 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -54,6 +54,10 @@ def live_configuration(live_init_config_path: Union[str, Path], f"file." 
) + # Amend root directory, if needed + if "data_root_dir" not in file_config: + file_config["data_root_dir"] = "" + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} @@ -234,7 +238,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, ) # root_directory = file_configuration["database_directory"] - + # Initialize the database file initialize_database(root_directory, file_settings) From d1bdc2cd74037dfa1812062ac87b3ddf2605bc75 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 13:21:27 -0700 Subject: [PATCH 25/81] db pathing issues fixed --- echopop/live/live_acoustics.py | 5 ++++- echopop/live/live_data_loading.py | 4 ---- echopop/live/live_spatial_methods.py | 8 ++++++-- echopop/live/live_survey.py | 14 +++++++++----- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 6c1ebf08..24f96681 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -261,8 +261,11 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values + # Get root database directory + root_database = file_configuration["database_directory"] + # Update the successfully processed files - query_processed_files(file_configuration["data_root_dir"], + query_processed_files(root_database, file_configuration["input_directories"]["acoustics"], meta_dict["provenance"]["acoustic_files"], processed=True) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 25c89064..f6365689 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -54,10 +54,6 @@ def live_configuration(live_init_config_path: Union[str, Path], f"file." ) - # Amend root directory, if needed - if "data_root_dir" not in file_config: - file_config["data_root_dir"] = "" - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index d8a46523..c86f20b9 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -453,10 +453,14 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Get root folder directory + coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] # ---- Create filepath shp_filepath = ( - root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings['coastline_name']}.shp" + # root_dir / coast_settings["directory"] + # / coast_settings["coastline_name"] + coast_root + / f"{coast_settings['coastline_name']}.shp" ) # ---- Validate existence if not shp_filepath.exists(): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 58c5c27c..870b57da 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -39,6 +39,7 @@ from . import live_data_processing as eldp from . 
import live_data_loading as eldl + class LiveSurvey: """ A real-time processing version of the `echopop` base `Survey` class that ingests biological, @@ -60,7 +61,7 @@ def __init__( # initialize the Survey class object self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - # ---- Initialize config key for database files + # # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} ) @@ -198,6 +199,9 @@ def process_biology_data(self): # ----- Unprocessed biology_unprocessed = self.input["biology"] + # Get database root directory + root_directory = self.config["database_directory"] + # Check if data are present unprocess_data_dfs = ( [True if isinstance(df, pd.DataFrame) and not df.empty else False @@ -260,10 +264,10 @@ def process_biology_data(self): }) # Update the database - query_processed_files(self.config["data_root_dir"], - self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files"], - processed=True) + query_processed_files(root_directory, + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files"], + processed=True) def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): From 3e252a068330e2892439cbc12fee8fea13faca2c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 13:26:03 -0700 Subject: [PATCH 26/81] `data_root_dir` check for `read_biology_files` --- echopop/live/live_data_loading.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f6365689..0ad82db5 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -109,7 +109,10 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # ---- Initialize the dictionary that will define this key in the `input` attribute biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} # # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + if "data_root_dir" in file_configuration: + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + else: + directory_path = Path(file_settings["directory"]) # Add SQL file to dict # file_configuration["database"]["biology"] = ( From a1cec0198c5758cdfbe59558ee98a8d5dd62a169 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 19:53:29 -0700 Subject: [PATCH 27/81] Gridding methods --- echopop/live/live_data_processing.py | 79 +++---------- echopop/live/live_spatial_methods.py | 83 +++++++++++++- echopop/live/live_visualizer.py | 0 echopop/live/sql_methods.py | 60 ++++++++++ echopop/mesh_generation.py | 161 +++++++++++++++++++++++++-- echopop/test_workflow.py | 1 - echopop/zarr_read_ingest_test.py | 62 +++++++++-- 7 files changed, 355 insertions(+), 91 deletions(-) create mode 100644 echopop/live/live_visualizer.py diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index c46317a0..a235bf58 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,8 +2,9 @@ import re from functools import reduce -from .sql_methods import SQL, sql_group_update +from .sql_methods import SQL, sql_group_update, query_dataset, get_unique_identifiers from .live_biology import summarize_strata +from .live_spatial_methods import update_population_grid 
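# A minimal usage sketch for the two helpers now imported from `sql_methods`:
# `get_unique_identifiers` collapses one or more DataFrames down to their unique key
# combinations, and `query_dataset` SELECTs only the matching rows from an existing
# table (returning None when the table is absent). The database path and the toy
# DataFrames below are hypothetical.
#
#   import pandas as pd
#   from echopop.live.sql_methods import get_unique_identifiers, query_dataset
#
#   toy_dict = {
#       "specimen_df": pd.DataFrame({"stratum": [1, 1, 2], "length": [20.0, 22.0, 25.0]}),
#       "length_df": pd.DataFrame({"stratum": [2, 3], "length": [24.0, 30.0]}),
#   }
#   keys_df = get_unique_identifiers(toy_dict, unique_columns=["stratum"])  # strata 1, 2, 3
#   subset_df = query_dataset("biology.db", toy_dict, table_name="specimen_df",
#                             data_columns=["stratum", "length"],
#                             unique_columns=["stratum"])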
from pathlib import Path from typing import Union, Tuple, Optional, List @@ -16,68 +17,6 @@ LIVE_INPUT_FILE_CONFIG_MAP ) -def get_unique_identifiers(data_dict: dict, - unique_columns: List[str]) -> pd.DataFrame: - - # Gather all dataframes from a dictionary into a list - if isinstance(data_dict, dict): - df_list = [df for _, df in data_dict.items()] - else: - df_list = [data_dict] - - # Get unique values of each contrast column across the biological datasets - # dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - # for df in df_list if isinstance(df, pd.DataFrame) and not df.empty and col in df.columns] - combined_df = pd.concat( - [df[unique_columns] for df in df_list if all(col in df.columns for col in unique_columns)], - ignore_index=True - ).drop_duplicates() - - # Reduce into a single DataFrame - return combined_df - # if len(unique_columns) > 1: - # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) - # else: - # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) - -def query_dataset(db_file: str, - data_dict: dict, - table_name: str, - data_columns: List[str], - unique_columns: List[str], - constraint: str = None): - - # Validate that the desired table exists - if SQL(db_file, "validate", table_name=table_name): - # ---- Inspect the SQL table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of intersecting column names - unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns))) - # ---- Create list of valid columns - valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) - # ---- Get unique identifiers - unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string - conditional_str = " | ".join( - [" & ".join([f"{col} = {val}" for col, val in row.items()]) - for _, row in unique_keys_df.iterrows()] - ) - # conditional_str = ( - # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - # for col in unique_keys_df.columns]) - # ) - # ---- Append the additional constraint statement if present - if constraint is not None: - conditional_str = f"({conditional_str})" + f" & {constraint}" - # ---- SELECT the dataset using the conidtional statement - data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, - condition=conditional_str).filter(data_columns) - else: - data_sql = None - - # Return the table DataFrame - return data_sql - def get_average_strata_weights(db_file: str, data_dict: dict, unique_columns: list): @@ -164,6 +103,10 @@ def acoustic_pipeline(acoustic_dict: dict, # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) + # Update grid + update_population_grid(file_configuration, coordinates=["x", "y"], + dataset=nasc_biology) + def get_nasc_sql_data(db_file: str, data_dict: dict, unique_columns: List[str]): @@ -246,7 +189,7 @@ def biology_pipeline(biology_dict: dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + sigma_bs_df = get_sigma_bs_sql_data(biology_db, biology_dict, unique_columns=unique_columns) @@ -262,8 +205,8 @@ def biology_pipeline(biology_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) + biology_dict, + 
unique_columns=unique_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates @@ -282,3 +225,7 @@ def biology_pipeline(biology_dict: dict, # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) + + # Update population grid + update_population_grid(file_configuration, coordinates=["stratum"], + dataset=nasc_biology) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index c86f20b9..510e26a6 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -7,7 +7,8 @@ from shapely.geometry import box import sqlalchemy as sqla from pathlib import Path -from typing import Union +from typing import Union, List +from .sql_methods import sql_group_update, query_dataset def create_inpfc_strata(spatial_config: dict): @@ -181,8 +182,8 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Generate the cells grid_cells = [] # ---- Iterate through - for y0 in np.arange(ymin, ymax+y_step, y_step): - for x0 in np.arange(xmin, xmax+x_step, x_step): + for y0 in np.arange(ymin, ymax, y_step): + for x0 in np.arange(xmin, xmax, x_step): x1 = x0-x_step y1 = y0+y_step grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) @@ -210,9 +211,9 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): dataset_gdf["stratum_x"] = pd.cut( dataset_gdf["x"], np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 + right = False, + labels = np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))), + ).astype(int) # Bin the latitude data dataset_gdf["stratum_y"] = pd.cut( @@ -501,6 +502,8 @@ def initialize_grid(file_configuration = dict): # Calculate area per cell cells_gdf.loc[:, "area"] = cells_gdf.area + # ---- Convert back to nmi^2 from m^2 + cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852 ** 2 # Convert back to original projection and clip clipped_cells_latlon = ( @@ -540,3 +543,71 @@ def initialize_grid(file_configuration = dict): ) # ---- Connect and create table _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) + +def update_population_grid(file_configuration: dict, + coordinates: Union[List[str], str], + dataset: Union[dict, pd.DataFrame]): + + # Extract input directory settings + file_settings = file_configuration["input_directories"] + + # Get filepath for grid + grid_db = list( + Path(file_configuration["database_directory"]) + .glob(pattern=f"{file_settings["grid"]["database_name"]}") + )[0] + + # Get filepath for acoustics + survey_db = list( + Path(file_configuration["database_directory"]) + .glob(pattern=f"{file_settings["acoustics"]["database_name"]}") + )[0] + + # Define the SQL tables that will be parsed and queries + data_table = "survey_data_df" + grid_table = "grid_df" + + # Get indexed survey data + indexed_data = query_dataset(survey_db, + dataset, + table_name=data_table, + data_columns=coordinates + ["x", "y", "number_density", + "biomass_density"], + unique_columns=coordinates) + + # Get indexed grid data + indexed_grid = query_dataset(grid_db, + indexed_data, + table_name=grid_table, + data_columns= ["x", "y", "area", "number_density_mean", + "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) + + # Set DataFrame index + indexed_grid.set_index(["x", "y"], inplace=True) + + # Update the areal density esitmates + # ---- Number (animals/nmi^2) + 
indexed_grid["number_density_mean"] = indexed_data.groupby(["x", "y"])["number_density"].mean() + # ---- Bioamss (kg/nmi^2) + indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])["biomass_density"].mean() + + # Compute the abundance and biomass per grid cell + # ---- Abundance (# animals) + indexed_grid["abundance"] = indexed_grid["number_density_mean"] * indexed_grid["area"] + # ---- kg + indexed_grid["biomass"] = indexed_grid["biomass_density_mean"] * indexed_grid["area"] + + # Update grid table + # ---- Reset index + output_df = indexed_grid.reset_index() + # ---- Grouped update + sql_group_update(grid_db, dataframe=output_df, table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", + "biomass"], + unique_columns=["x", "y"]) + + + + + \ No newline at end of file diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 7ae3824f..eb009780 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -566,6 +566,29 @@ def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List # Return a list of the output return list(key_columns) +def get_unique_identifiers(data_dict: dict, + unique_columns: List[str]) -> pd.DataFrame: + + # Gather all dataframes from a dictionary into a list + if isinstance(data_dict, dict): + df_list = [df for _, df in data_dict.items()] + else: + df_list = [data_dict] + + # Get unique values of each contrast column across the biological datasets + combined_df = pd.concat( + [df[unique_columns] for df in df_list if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns)], + ignore_index=True + ).drop_duplicates() + + # Reduce into a single DataFrame + return combined_df + # if len(unique_columns) > 1: + # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # else: + # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) + + def parse_condition(condition: str): # Replace logical operators with SQL equivalents condition = condition.replace('&', ' AND ').replace('|', ' OR ') @@ -699,6 +722,43 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str f"Attempted reset of [{str(db_file)}] failed." 
) +def query_dataset(db_file: str, + data_dict: dict, + table_name: str, + data_columns: List[str], + unique_columns: List[str], + constraint: Optional[str] = None): + + # Validate that the desired table exists + if SQL(db_file, "validate", table_name=table_name): + # ---- Inspect the SQL table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of intersecting column names + unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns))) + # ---- Create list of valid columns + valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) + # ---- Get unique identifiers + unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Create conditional string + conditional_str = " | ".join( + [" & ".join([f"{col} = {val}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows()] + ) + # conditional_str = ( + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) + # ) + # ---- Append the additional constraint statement if present + if constraint is not None: + conditional_str = f"({conditional_str})" + f" & {constraint}" + # ---- SELECT the dataset using the conidtional statement + data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, + condition=conditional_str).filter(data_columns) + else: + data_sql = None + + # Return the table DataFrame + return data_sql def sql_update_strata_summary(source_db: str, target_db: str, source_table: str, diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 699eed4f..7752fe63 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -2033,28 +2033,167 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # plt.xlim(lon_min-3, lon_max+3) # plt.ylim(lat_min-3, lat_max+3) # plt.show() -test = SQL(db_filepath, "select", table_name="grid_df") +from echopop.live.sql_methods import SQL from shapely import wkt import matplotlib.pyplot as plt +import geopandas as gpd +import matplotlib.colors as colors +import matplotlib.cm as cm +import numpy as np +from matplotlib.colors import ListedColormap +import matplotlib.dates as mdates +from datetime import datetime +db_filepath = realtime_survey.config["database"]["grid"] +survey_db = realtime_survey.config["database"]["acoustics"] +grid_df = SQL(db_filepath, "select", table_name="grid_df") +# grid_df[grid_df.abundance > 0] +grid_df[grid_df.abundance > 1e10] +# grid_df[grid_df.abundance > 0] +coast_df = SQL(db_filepath, "select", table_name="coastline_df") +survey_df = SQL(survey_db, "select", table_name="survey_data_df") + +# def parse_datetime(date_str): +# # List of possible formats +# formats = [ +# '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds +# '%Y-%m-%d %H:%M:%S', # Without fractional seconds +# '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds +# '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds +# ] + +# for fmt in formats: +# try: +# return pd.to_datetime(date_str, format=fmt) +# except (ValueError, TypeError): +# continue # Try the next format + +# return pd.NaT # Return NaT if no formats match -test = output_df.copy() -test["geometry"] = test["geometry"].apply(wkt.loads) -test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs=projection) +# survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) -co = SQL(db_filepath, "select", table_name="coastline_df") -co["geometry"] = co["geometry"].apply(wkt.loads) -co_gdf = 
gpd.GeoDataFrame(co, geometry="geometry", crs=projection) +# pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") -lims = test_gdf.total_bounds +# fig, ax = plt.subplots(figsize=(5, 8)) +# ax.scatter(survey_df.ping_time, survey_df.nasc) +# plt.ylabel("NASC") +# # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) +# plt.show() -fig, ax = plt.subplots(figsize=(10, 10)) -test_gdf.plot(ax=ax, column="abundance", edgecolor="black", cmap="viridis", legend=False) -co_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + +# times = np.arange(np.datetime64('2001-01-02'), +# np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) +# y = np.random.randn(len(times)) +# survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() +# survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() + +# fig, ax = plt.subplots() +# ax.plot(times, y) +# survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] +# # a = self.input["acoustics"]["prc_nasc_df"] +# # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] + +grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) +coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) + +projection = realtime_survey.config["geospatial"]["projection"] + +grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) +grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] +coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) + +lims = grid_gdf.total_bounds +# nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] +# dataset_gdf.stratum_x.max() +# # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) + +# # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) +# pd.cut( +# nu["x"], +# np.arange(xmin, xmax, x_step), +# right = False, +# labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), +# ).astype(int) - 1 +# grid_gdf["x"] = grid_gdf["x"] - 1 + +# fig, ax = plt.subplots(figsize=(5, 8)) +# grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) +# plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') +# plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") +# # Calculate centroids and plot text +# for idx, row in grid_gdf.iterrows(): +# centroid = row.geometry.centroid +# var = f"{row.x}-{row.y}" +# ax.annotate(var, xy=(centroid.x, centroid.y), +# xytext=(0,0), fontsize=8, +# textcoords="offset points", +# ha='center', va='center', color='black') +# plt.tight_layout() +# plt.margins(0, 0) +# coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# plt.xlim(lims[0]*1.005, lims[2]*1.01) +# plt.ylim(lims[1]*0.98, lims[3]*1.005) +# plt.show() + + +variable = "abundance" +VARIABLE_MAP = { + "number_density_mean": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$" + }, + "biomass_density_mean": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$" + }, + "biomass": { + "name": "Biomass", + "units": "kg" + }, + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$" + } +} + +viridis = plt.colormaps.get_cmap('viridis').resampled(1024) +newcolors = viridis(np.linspace(0, 1, 1024))[::-1] +white = np.array([1, 1, 1, 1]) +newcolors[0, :] = white +custom_cmap = ListedColormap(newcolors) +# Check the minimum and maximum values for normalization + + +fig, ax = plt.subplots(figsize=(5, 8)) +grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) 
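+# NOTE: `norm` is referenced by the very next plot call but is only assigned a few lines
+# further down in this scratch script; a minimal normalization (mirroring the later
+# assignments) is hoisted here so that the call below has a defined value.
+vmax = grid_gdf[variable].max()
+norm = colors.Normalize(vmin=0, vmax=vmax, clip=False)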
+grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, legend=False, norm=norm) +plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") +vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() +vmax = grid_gdf[variable].max() +norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) +# norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), vmax=grid_gdf[variable].max()) +# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, orientation="horizontal", shrink=0.5) +cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, orientation="horizontal", shrink=0.5) +cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", + fontsize=12, labelpad=10, loc='center') +cbar.ax.xaxis.set_label_position('top') +cbar.ax.xaxis.set_ticks_position('top') +plt.tight_layout() +plt.margins(0,0) +# grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") +coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") plt.xlim(lims[0]*1.005, lims[2]*1.01) plt.ylim(lims[1]*0.98, lims[3]*1.005) +plt.xlabel(u'Longitude (\u00B0E)') +plt.ylabel(u'Latitude (\u00B0N)') plt.show() +co = SQL(db_filepath, "select", table_name="coastline_df") +co["geometry"] = co["geometry"].apply(wkt.loads) +co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + + + test["geometry"].apply(wkt.loads) clipped_cells_latlon["geometry"] len(bbox_latlon.exterior.coords) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index e52c6739..7f2006c5 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,6 +1,5 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL - #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2df92682..101bc81a 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -24,6 +24,27 @@ from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey +import geopandas as gpd +import pandas as pd +import numpy as np +import shapely.geometry +from shapely.geometry import box +from echopop.spatial.projection import utm_string_generator +from geopy.distance import distance +from echopop.live.sql_methods import SQL +from shapely import wkt +import matplotlib.pyplot as plt +import geopandas as gpd +import matplotlib.colors as colors +import matplotlib.cm as cm +import numpy as np +from matplotlib.colors import ListedColormap +self = realtime_survey +spatial_config = self.config["geospatial"] +dataset = self.input["acoustics"]["nasc_df"] + + + survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") survey_2019.transect_analysis() @@ -434,6 +455,8 @@ def biology_pipeline(biology_dict: dict, SQL(survey_db_file, "select", table_name=data_table) SQL(data_table, "map") +gridding_column = self.config["gridding_column"] + updated_survey_data = nasc_biology.copy() # Get relevant table previous_grid = query_dataset(grid_db_file, updated_survey_data, @@ -441,23 +464,39 @@ def 
biology_pipeline(biology_dict: dict, data_columns=["x", "y", "area", "number_density_mean", "biomass_density_mean", "abundance", "biomass"], unique_columns=["x", "y"]) +previous_data = query_dataset(survey_db_file, updated_survey_data, + table_name=data_table, + data_columns=["x", "y", "number_density", "biomass_density"], + unique_columns=["x", "y"]) +# Get unique coordinates +update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) + # Index previous_grid.set_index(["x", "y"], inplace=True) -previous_grid["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() -previous_grid["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"].mean() +previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() # Convert area from m^2 to nmi^2 previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] +previous_grid = previous_grid.reset_index() -# Get unique coordinates -update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) -update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +sql_group_update(grid_db_file, dataframe=previous_grid, + table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) +murr = SQL(grid_db_file, "select", table_name=grid_table) +murr[murr.abundance > 0] +update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +am = SQL(grid_db_file, "select", table_name="grid_df") +am[am.abundance > 0] +bm = SQL(grid_db_file, "select", table_name="grid_df") +bm[bm.abundance > 0] number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() @@ -1656,6 +1695,10 @@ def __init__( TS_SLOPE = 20.0 TS_INTERCEPT = -68.0 +acoustic_db = realtime_survey.config["database"]["acoustics"] +SQL(acoustic_db, "select", table_name="files_processed") +biology_db = realtime_survey.config["database"]["biology"] +SQL(biology_db, "select", table_name="files_processedk") #### # CONCATENATE FILE SOURCES specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() @@ -1666,6 +1709,12 @@ def __init__( comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() +from echopop.live.sql_methods import SQL + +# Assuming that you have a LiveSurvey object defined +# ---- Get the database file name (and path) +biology_db = livesurvey_object.config["database"]["biology"] +# ---- # CONVERT TO TS comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT # TO SIGMA_BS @@ -1673,7 +1722,6 @@ def __init__( # WEIGHTED MEAN SIGMA_BS sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) -### # INTEGRATE NASC path2file = 
"C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" From dd87bc909216736f1a422e4c8a94c3776f150d12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Tue, 13 Aug 2024 10:16:51 -0700 Subject: [PATCH 28/81] Grid fix --- echopop/live/live_spatial_methods.py | 4 +- echopop/test_workflow.py | 94 +++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 510e26a6..33f534dd 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -554,13 +554,13 @@ def update_population_grid(file_configuration: dict, # Get filepath for grid grid_db = list( Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings["grid"]["database_name"]}") + .glob(pattern=f"{file_settings['grid']['database_name']}") )[0] # Get filepath for acoustics survey_db = list( Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings["acoustics"]["database_name"]}") + .glob(pattern=f"{file_settings['acoustics']['database_name']}") )[0] # Define the SQL tables that will be parsed and queries diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 7f2006c5..e95f7336 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -65,4 +65,96 @@ # NOTE: Quantized length-binned weights (summed) SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") \ No newline at end of file +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") + +dat = realtime_survey.input["acoustics"]["prc_nasc_df"].copy() +dat = dat[dat.latitude > 40] +dat = dat[dat.depth > 20] + +import matplotlib.pyplot as plt +import seaborn as sns +from geopy.distance import geodesic +import pandas as pd +from datetime import datetime +import matplotlib.dates as mdates +def calculate_distances(df): + distances = [0] # Start with 0 for the first point + for i in range(1, len(df)): + point1 = (df.iloc[i - 1]['latitude'], df.iloc[i - 1]['longitude']) + point2 = (df.iloc[i]['latitude'], df.iloc[i]['longitude']) + distances.append(geodesic(point1, point2).meters) + return distances + +def parse_datetime(date_str): + # List of possible formats + formats = [ + '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds + '%Y-%m-%d %H:%M:%S', # Without fractional seconds + '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds + '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds + ] + + for fmt in formats: + try: + return pd.to_datetime(date_str, format=fmt) + except (ValueError, TypeError): + continue # Try the next format + + return pd.NaT # Return NaT if no formats match + +dat["ping_time"] = dat["ping_time"].apply(parse_datetime) + +pivot_table = dat.pivot_table(index=["depth"], columns=["ping_time"], values=["NASC"], aggfunc="mean") +# Get the unique distance and depth values for plotting +plt.figure(figsize=(10, 8)) +ax = sns.heatmap(pivot_table, cmap="viridis", cbar_kws={'label': 'NASC'}) +plt.gca().xaxis.set_major_locator(mdates.MinuteLocator(interval=30)) # Major ticks every 30 minutes +plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) # Format as hour:minute +plt.gcf().autofmt_xdate() +ax.set_xticks(ax.get_xticks()[::max(len(ax.get_xticks()) // 10, 1)]) # Show fewer ticks if necessary +plt.xlabel('Ping time') 
+plt.ylabel('Depth') +# plt.gca().invert_yaxis() # To have depth increasing downwards like in a typical depth plot +plt.show() + + +dat.groupby(["ping_time"]).size() +unique_pairs = dat.drop_duplicates(subset=['latitude', 'longitude']).sort_values("ping_time") + +unique_pairs["d"] = calculate_distances(dat) +df['cumulative_distance'] = df['distance'].cumsum() + +unique_distances = dat.groupby('source')[['latitude', 'longitude']].unique().reset_index() +unique_distances = unique_distances.explode('distance') + + +# Create a pivot table to reshape the dataframe suitable for a heatmap +dat['source_id'] = dat['source'].astype('category').cat.codes +pivot_table = dat.pivot(index=["depth"], columns=["distance"], values=["NASC"]) +dat.groupby('source')['distance'].cumsum() +plt.plot(index="depth", columns="distance", values="NASC") +plt.show() + +data = { + 'distance': [1, 1, 2, 2, 1, 1, 3, 3], + 'depth': [1, 2, 1, 2, 1, 2, 1, 2], + 'source': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'] +} +dat = pd.DataFrame(data) +dat = dat.sort_values(by=['source', 'distance']) +unique_distances = dat.groupby('source')['distance'].unique().reset_index() +unique_distances = unique_distances.explode('distance') + +unique_distances['distance'] = pd.to_numeric(unique_distances['distance'], errors='coerce') +unique_distances['distance_diff'] = unique_distances.groupby('source')['distance'].diff().fillna(0) +unique_distances['cumsum_diff'] = unique_distances.groupby('source')['distance_diff'].cumsum() +unique_distances['Cumsum_dist'] = unique_distances['cumsum_diff'].cumsum() +unique_distances['Cumsum_dist'] = pd.to_numeric(unique_distances['Cumsum_dist'], errors='coerce') +dat = dat.merge(unique_distances[['source', 'distance', 'Cumsum_dist']], on=['source', 'distance'], how='left') + + +# Calculate cumulative sum of distances for each source +dat['Cumsum_dist'] = dat.groupby('source')['distance'].transform(lambda x: x.cumsum()) +dat['Cumsum_dist_within_source'] = dat.groupby('source')['distance'].cumsum() + +dat['Cumsum_dist'] = dat.groupby('source')['Cumsum_dist_within_source'].transform(lambda x: x + x.shift(1).fillna(0).cumsum()) From 8693641f1bc6dfd5069cc5db9fa853680308138c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 10:32:53 -0700 Subject: [PATCH 29/81] Add `xarray` kwargs options --- echopop/live/live_data_loading.py | 10 ++++++---- echopop/live/live_survey.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 0ad82db5..e8b29ff9 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -57,7 +57,8 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: List[Path]) -> tuple: +def read_acoustic_files(acoustic_files: List[Path], + xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. 
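    # NOTE: `xarray_kwargs` is forwarded (via `read_acoustic_zarr`) to `xr.open_dataset`
    # for each zarr file, e.g. storage options/credentials when the stores are remote.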
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` @@ -66,7 +67,8 @@ def read_acoustic_files(acoustic_files: List[Path]) -> tuple: # Read all of the zarr files results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(Path(file), acoustics_config_map) + read_acoustic_zarr(Path(file), acoustics_config_map, + xarray_kwargs=xarray_kwargs) for file in acoustic_files )] @@ -154,7 +156,7 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # Return the output return biology_output -def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: +def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) -> tuple: # Format the file reading configuration # ---- Concatenate into a full configuration map @@ -162,7 +164,7 @@ def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: **config_map["xarray_variables"]} # Determine the file loading method for the `acoustic_files` - zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto") + zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto", **xarray_kwargs) # Pre-process the Dataset, convert it to a DataFrame, and validate the structure # ---- Convert to a DataFrame diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 870b57da..9da07806 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -128,12 +128,14 @@ def __str__(self): return self.__repr__() def load_acoustic_data(self, - input_filenames: Optional[list] = None, + xarray_kwargs: dict = {}, + input_filenames: Optional[list] = None, verbose: bool = True): # Validate the data directory and format the filepaths - acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames) + acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", + input_filenames=input_filenames, + xarray_kwargs=xarray_kwargs) # Read in the acoustic data files if acoustic_files: From faed21a809bb9a23620cabf129ba6f7f5abc4b9b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 10:38:19 -0700 Subject: [PATCH 30/81] `pandas` kwargs storage options --- echopop/live/live_data_loading.py | 10 ++++++---- echopop/live/live_survey.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index e8b29ff9..30e14904 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -96,7 +96,8 @@ def filter_filenames(directory_path: Path, filename_id: str, # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[Path], file_configuration: dict): +def read_biology_files(biology_files: List[Path], file_configuration: dict, + pandas_kwargs: dict = {}): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] @@ -137,7 +138,8 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # ---- Read in validated biology data dataframe_list = [read_biology_csv(Path(file), file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) + biology_config_map[dataset], + pandas_kwargs) for file in dataset_files] # ---- Concatenate the dataset dataframe_combined = pd.concat(dataframe_list, ignore_index=True) @@ -265,10 +267,10 @@ def 
compile_filename_format(file_name_format: str): # Compile the regex pattern and return the output return re.compile(regex_pattern) -def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), **pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 9da07806..fa738967 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -165,6 +165,7 @@ def load_acoustic_data(self, self.input["acoustics"]["prc_nasc_df"] = None def load_biology_data(self, + pandas_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True): @@ -182,7 +183,8 @@ def load_biology_data(self, ) # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config) + initial_biology_output = eldl.read_biology_files(biology_files, self.config, + pandas_kwargs=pandas_kwargs) # Preprocess the biology dataset self.input["biology"], self.input["biology_processed"] = ( From af9385170ca0ac5f0ae57a11e74ac626e83a63ca Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:08:40 -0700 Subject: [PATCH 31/81] `xarray_kwargs` patch --- echopop/live/live_survey.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index fa738967..3d5dd446 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -128,20 +128,20 @@ def __str__(self): return self.__repr__() def load_acoustic_data(self, - xarray_kwargs: dict = {}, + xarray_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True): # Validate the data directory and format the filepaths acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames, - xarray_kwargs=xarray_kwargs) + input_filenames=input_filenames) # Read in the acoustic data files if acoustic_files: # ! [REQUIRES DASK] ---- Read in the listed file # ---- Read in the acoustic data files - prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files) + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files, + xarray_kwargs=xarray_kwargs) # ---- Add the `acoustic_data_units` to the dictionary self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset From 0b13fa74d70cb88999165897c589ba7eb22fd9e8 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:17:45 -0700 Subject: [PATCH 32/81] Disable file/directory existence checker --- echopop/live/live_data_loading.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 30e14904..caa55244 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -217,19 +217,19 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate filepath, columns, datatypes # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." 
- ) + # if not directory_path.exists(): + # raise FileNotFoundError( + # f"The acoustic data directory [{directory_path}] does not exist." + # ) # Validate that files even exist # ---- List available *.zarr files data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - ) + # if not data_files: + # raise FileNotFoundError( + # f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" + # ) # Check and format specific input filenames if isinstance(input_filenames, list): From 9e9ae077202988c4515ddbb4746262ebf96e3b94 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:26:38 -0700 Subject: [PATCH 33/81] Remove `Path` typing for acoustic zarr input --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index caa55244..afd9be00 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -67,7 +67,7 @@ def read_acoustic_files(acoustic_files: List[Path], # Read all of the zarr files results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(Path(file), acoustics_config_map, + read_acoustic_zarr(file, acoustics_config_map, xarray_kwargs=xarray_kwargs) for file in acoustic_files )] From a46ccacb95beb69d74e28df3f5e2c7eb7558577b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:52:57 -0700 Subject: [PATCH 34/81] Attempts pathing fixes --- echopop/live/live_data_loading.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index afd9be00..96485229 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -207,13 +207,20 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Get the acoustic file settings and root directory # ---- Root directory if "data_root_dir" in file_configuration.keys(): - root_directory = Path(file_configuration["data_root_dir"]) + # root_directory = Path(file_configuration["data_root_dir"]) + root_directory = file_configuration["data_root_dir"] else: - root_directory = Path() + # root_directory = Path() + root_directory = "" # ---- File folder - data_directory = Path(file_settings["directory"]) + # data_directory = Path(file_settings["directory"]) + data_directory = file_settings["directory"] # ---- Createa directory path - directory_path = root_directory / data_directory + # directory_path = root_directory / data_directory + if root_directory != "": + directory_path = "/".join([root_directory, data_directory]) + else: + directory_path = data_directory # Validate filepath, columns, datatypes # ---- Error evaluation (if applicable) @@ -224,7 +231,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate that files even exist # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + # data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) # if not data_files: # raise FileNotFoundError( @@ -233,21 +240,25 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Check and format specific input filenames 
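    # NOTE: entries supplied via `input_filenames` are joined onto `directory_path` with
    # plain string concatenation below rather than `pathlib`, presumably so that remote
    # URI prefixes (e.g., "s3://") are left untouched.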
if isinstance(input_filenames, list): - data_files = [directory_path / filename for filename in input_filenames] + # data_files = [directory_path / filename for filename in input_filenames] + data_files = ["/".join([directory_path, filename]) for filename in input_filenames] # ---- Raise Error elif input_filenames is not None: raise TypeError( "Data loading argument `input_filenames` must be a list." ) - # - root_directory = file_configuration["database_directory"] + else: + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + + # Database root directory + database_root_directory = file_configuration["database_directory"] # Initialize the database file - initialize_database(root_directory, file_settings) + initialize_database(database_root_directory, file_settings) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( - query_processed_files(root_directory, file_settings, data_files) + query_processed_files(database_root_directory, file_settings, data_files) ) # Return the valid filenames/paths From 48dd27a2fb04e151f668f97efa2dcd2694516f12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:00:37 -0700 Subject: [PATCH 35/81] More Path removal changes --- echopop/live/live_data_loading.py | 2 +- echopop/live/sql_methods.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 96485229..6b9b50ef 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -96,7 +96,7 @@ def filter_filenames(directory_path: Path, filename_id: str, # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[Path], file_configuration: dict, +def read_biology_files(biology_files: List[str], file_configuration: dict, pandas_kwargs: dict = {}): # Get the biology data file settings diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index eb009780..d1504f90 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -650,9 +650,11 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file # db_directory = Path(file_configuration["database_directory"]) - db_directory = Path(root_directory) + # db_directory = Path(root_directory) + db_directory = root_directory # ---- Complete path to the database file - db_file = db_directory / db_name + # db_file = db_directory / db_name + db_file = "/".join([db_directory, db_name]) # Create a list of string-formatted Path names files_str = [str(file) for file in files] From 7c5d38ec5c06616dd8945a79823500986e901853 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:01:08 -0700 Subject: [PATCH 36/81] More Path removal --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 6b9b50ef..cbf04526 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -57,7 +57,7 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: 
List[Path], +def read_acoustic_files(acoustic_files: List[str], xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. From 152b703c8748904ee80ff828b601978c838c3484 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:46:40 -0700 Subject: [PATCH 37/81] Coastline db update fixes (pathing) --- echopop/live/live_spatial_methods.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 33f534dd..9fbefd05 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -358,18 +358,22 @@ def initialize_grid(file_configuration = dict): # Get root directory, if defined if "data_root_dir" in file_configuration: - root_dir = Path(file_configuration["data_root_dir"]) + # root_dir = Path(file_configuration["data_root_dir"]) + root_dir = file_configuration["data_root_dir"] else: - root_dir = Path() + # root_dir = Path() + root_dir = "" # Get `grid` settings grid_database = file_configuration["input_directories"]["grid"]["database_name"] # ---- db_directory = Path(file_configuration["database_directory"]) + # db_directory = file_configuration["database_directory"] # Create full filepath # db_filepath = root_dir / "database" / grid_database db_filepath = db_directory / grid_database + # db_filepath = "/".join([db_directory, grid_database]) # ---- Update config file_configuration["database"]["grid"] = db_filepath @@ -455,19 +459,23 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] # ---- Get root folder directory - coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + coast_root = ( + "/".join([root_dir, coast_settings["directory"], coast_settings["coastline_name"]]) + ) # ---- Create filepath shp_filepath = ( # root_dir / coast_settings["directory"] # / coast_settings["coastline_name"] - coast_root - / f"{coast_settings['coastline_name']}.shp" + # coast_root + # / f"{coast_settings['coastline_name']}.shp" + "/".join([coast_root, f"{coast_settings['coastline_name']}.shp"]) ) # ---- Validate existence - if not shp_filepath.exists(): - raise FileNotFoundError( - f"{shp_filepath} does not exist!" - ) + # if not shp_filepath.exists(): + # raise FileNotFoundError( + # f"{shp_filepath} does not exist!" 
+ # ) # Get original lat/lon geometry boundaries xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds From c75be735edaf86adbbe69f307ba02a79513179f9 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:54:17 -0700 Subject: [PATCH 38/81] Add `storage_options` input for `pygrio.read_file` --- echopop/live/live_spatial_methods.py | 2 +- echopop/live/live_survey.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 9fbefd05..b57e0746 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -481,7 +481,7 @@ def initialize_grid(file_configuration = dict): xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds # Read in file - full_coast = gpd.read_file(shp_filepath) + full_coast = gpd.read_file(shp_filepath, **file_configuration["storage_options"]) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 3d5dd446..07387c26 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -50,6 +50,7 @@ def __init__( self, live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], + cloud_storage_options: dict = {}, verbose: bool = True, ): # Initialize `meta` attribute @@ -61,10 +62,14 @@ def __init__( # initialize the Survey class object self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - # # ---- Initialize config key for database files + # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} ) + # ---- Add cloud storage options, if needed + self.config.update( + {"storage_options": cloud_storage_options} + ) # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) From 755c9cf762a10e3d3b46c927191ae0a6146f9de3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 13:01:18 -0700 Subject: [PATCH 39/81] Fix to `storage_options` arg for `geopandas` --- echopop/live/live_spatial_methods.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index b57e0746..00bc2711 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -481,7 +481,8 @@ def initialize_grid(file_configuration = dict): xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds # Read in file - full_coast = gpd.read_file(shp_filepath, **file_configuration["storage_options"]) + full_coast = gpd.read_file(shp_filepath, + storage_options=file_configuration["storage_options"]) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty From 9fc24936c1e65139826701ac10233d7f1eb1f861 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 13:19:12 -0700 Subject: [PATCH 40/81] Updated `pygrio` engine settings --- echopop/live/live_spatial_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 00bc2711..2d7ac606 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -482,6 +482,7 @@ def initialize_grid(file_configuration = dict): # Read in file full_coast = gpd.read_file(shp_filepath, + engine="pyogrio", storage_options=file_configuration["storage_options"]) # ---- Convert to 
UTM full_coast_utm = full_coast.to_crs(utm_code) From 1c6c81a9792617e9813ea903318f83f6f7dcdb32 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 15:19:21 -0700 Subject: [PATCH 41/81] Fixed random/inconsistent column key missing --- echopop/live/live_biology.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 5fcf3c32..99264e0f 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -120,8 +120,11 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - sql_results_dict.update({table_name: table_df.drop(columns="id")}) + # ---- Drop SQL db identifier + if "id" in table_df.columns: + table_df.drop(columns="id", inplace=True) + # ---- Add to the outgoing dictionary + sql_results_dict.update({table_name: table_df}) # Return the output return filtered_biology_output, sql_results_dict From 95901d354c802a4a452bd25d433ab24f32567d69 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 15:53:11 -0700 Subject: [PATCH 42/81] Change files read/processed tracking --- echopop/live/live_acoustics.py | 2 +- echopop/live/live_data_loading.py | 2 +- echopop/live/live_survey.py | 20 ++++++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 24f96681..5aea43f7 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -267,7 +267,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Update the successfully processed files query_processed_files(root_database, file_configuration["input_directories"]["acoustics"], - meta_dict["provenance"]["acoustic_files"], + meta_dict["provenance"]["acoustic_files_read"], processed=True) # Insert the new data into the database & pull in the combined dataset diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index cbf04526..abc69322 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -248,7 +248,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, "Data loading argument `input_filenames` must be a list." 
) else: - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) # Database root directory database_root_directory = file_configuration["database_directory"] diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 07387c26..97ac4425 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -95,7 +95,7 @@ def __repr__(self): # Get any acoustic files created if "acoustic_files" in self.meta["provenance"]: # ---- Get the filenames - acoustic_filenames = self.meta["provenance"]["acoustic_files"] + acoustic_filenames = self.meta["provenance"]["acoustic_files_read"] # ---- Subset if many files are being processed if len(acoustic_filenames) > 2: acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] @@ -107,7 +107,7 @@ def __repr__(self): # Get any biology files created if "biology_files" in self.meta["provenance"]: # ---- Get the filenames - biology_filenames = self.meta["provenance"]["biology_files"] + biology_filenames = self.meta["provenance"]["biology_files_read"] # ---- Subset if many files are being processed if len(biology_filenames) > 4: biology_filenames = biology_filenames + ["..."] @@ -156,7 +156,7 @@ def load_acoustic_data(self, self.config) # ---- Add meta key self.meta["provenance"].update({ - "acoustic_files": acoustic_files, + "acoustic_files_read": acoustic_files, }) # TODO: Add verbosity for printing database filepaths/connections if verbose: @@ -198,7 +198,7 @@ def load_biology_data(self, # Add meta key self.meta["provenance"].update({ - "biology_files": biology_files, + "biology_files_read": biology_files, }) def process_biology_data(self): @@ -275,9 +275,14 @@ def process_biology_data(self): # Update the database query_processed_files(root_directory, self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files"], + self.meta["provenance"]["biology_files_read"], processed=True) + # Add meta key + self.meta["provenance"].update({ + "biology_files_processed": self.meta["provenance"]["biology_files_read"] + }) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -303,7 +308,10 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): self.config, self.meta) - # Update the database + # Add meta key + self.meta["provenance"].update({ + "acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"] + }) def estimate_population(self, working_dataset: Literal["acoustic", "biology"], From 10ebb88fff1d24bcf05cbad45db5e8d374265961 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 16:19:53 -0700 Subject: [PATCH 43/81] Add file read checkpointing (`load_biology_data`) --- echopop/live/live_survey.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 97ac4425..03c0651b 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -178,6 +178,9 @@ def load_biology_data(self, biology_files = eldl.validate_data_directory(self.config, dataset="biology", input_filenames=input_filenames) + # ! 
REMOVE + self.meta["provenance"]["biology_files_checkpoint1"] = biology_files + # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: # ---- Create file list @@ -190,12 +193,22 @@ def load_biology_data(self, # Read in the biology data files initial_biology_output = eldl.read_biology_files(biology_files, self.config, pandas_kwargs=pandas_kwargs) + + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] =( + {key: df.shape for key, df in initial_biology_output.items()} + ) # Preprocess the biology dataset self.input["biology"], self.input["biology_processed"] = ( preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) ) + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] = ( + {key: df.shape for key, df in self.input["biology_processed"].items()} + ) + # Add meta key self.meta["provenance"].update({ "biology_files_read": biology_files, From 3e1d06032cc9c3cec4e0f4bea3fac46f4c5a05b5 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:17:28 -0700 Subject: [PATCH 44/81] fix to `read_csv` --- echopop/live/live_data_loading.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index abc69322..ecb60426 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -4,6 +4,7 @@ import re from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database import pandas as pd +import numpy as np from datetime import datetime import xarray as xr @@ -113,16 +114,18 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} # # ---- Create filepath object if "data_root_dir" in file_configuration: - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + # directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + directory_path = "/".join([file_configuration["data_root_dir"], file_settings["directory"]]) else: - directory_path = Path(file_settings["directory"]) + directory_path = file_settings["directory"] # Add SQL file to dict # file_configuration["database"]["biology"] = ( # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] # ) file_configuration["database"]["biology"] = ( - Path(file_configuration["database_directory"]) / file_settings["database_name"] + # Path(file_configuration["database_directory"]) / file_settings["database_name"] + "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) @@ -136,7 +139,7 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, # ---- If there are dataset files available if dataset_files: # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), + dataframe_list = [read_biology_csv(file, file_settings["file_name_formats"][dataset], biology_config_map[dataset], pandas_kwargs) @@ -281,7 +284,7 @@ def compile_filename_format(file_name_format: str): def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), **pandas_kwargs) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns From 
5ed0ce2970d9a0b2bd9a5e05dc8b69295a9db679 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:24:18 -0700 Subject: [PATCH 45/81] Fix glob cmd --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index ecb60426..84993104 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -87,7 +87,7 @@ def filter_filenames(directory_path: Path, filename_id: str, # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + subfile_path_obj = Path(directory_path).glob(f"{file_id_format}.{file_extension}") # ---- List all files that match this pattern subfile_str = [str(file) for file in list(subfile_path_obj)] From c6697d2b4333ee839aef07831e160962561fbc62 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:28:42 -0700 Subject: [PATCH 46/81] Index fix --- echopop/live/live_survey.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 03c0651b..9b468191 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -205,7 +205,7 @@ def load_biology_data(self, ) # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint2"] = ( + self.meta["provenance"]["biology_files_checkpoint3"] = ( {key: df.shape for key, df in self.input["biology_processed"].items()} ) From c7d2244ee8e16a832e56b211fd07d141845343b6 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:00:03 -0700 Subject: [PATCH 47/81] Fixed methods for s3 bucket --- echopop/live/live_data_loading.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 84993104..2e707c5c 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -86,8 +86,18 @@ def filter_filenames(directory_path: Path, filename_id: str, file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = Path(directory_path).glob(f"{file_id_format}.{file_extension}") + # ---- Compile the pattern + pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') + # ---- Create Path object with the generalized format: S3 + s3_files = [filename for filename in files + if filename.startswith("s3://") and pattern.search(filename)] + # ---- Local search + local_files = Path(directory_path).glob(f"{file_id_format}.{file_extension}") + # ---- Assign to subfile path object + if s3_files: + subfile_path_obj = s3_files + else: + subfile_path_obj = local_files # ---- List all files that match this pattern subfile_str = [str(file) for file in list(subfile_path_obj)] @@ -128,7 +138,6 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) - # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists @@ -284,7 +293,9 @@ def compile_filename_format(file_name_format: str): def 
read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) + df = pd.read_csv(file, + usecols=list(config_map["dtypes"].keys()), + storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns @@ -309,7 +320,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_k # Compile the filename regular expression compiled_regex = compile_filename_format(pattern) # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) + match_obj = compiled_regex.search(file) # Iterate through the filename-derived tags and add them to the DataFrame for i in valid_tags: From 89880bbae45f158bbe24bd93a4fadad2356bd45a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:08:03 -0700 Subject: [PATCH 48/81] Removed f-string --- echopop/live/live_data_loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 2e707c5c..96631b35 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -87,7 +87,9 @@ def filter_filenames(directory_path: Path, filename_id: str, # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Compile the pattern - pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') + escaped_file_id_format = re.escape(file_id_format) + pattern = re.compile(escaped_file_id_format.replace(r"\*", ".*")) + # pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') # ---- Create Path object with the generalized format: S3 s3_files = [filename for filename in files if filename.startswith("s3://") and pattern.search(filename)] From 724424693469d8a4bf45f2ac9d50475db3748a12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:38:34 -0700 Subject: [PATCH 49/81] `live_visualizer` module --- echopop/live/live_visualizer.py | 358 ++++++++++++++++++++++++++++++++ echopop/test_workflow.py | 108 ++-------- 2 files changed, 373 insertions(+), 93 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index e69de29b..0975c08a 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -0,0 +1,358 @@ +from echopop.live.sql_methods import SQL +from shapely import wkt +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +import numpy as np +import pandas as pd +import geopandas as gpd +from typing import Union, Optional +from pathlib import Path + +def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None): + + # Extract grid data from database if needed + if isinstance(grid_db, Path): + # ---- SELECT + grid_data = SQL(grid_db, "select", table_name="grid_df") + elif not isinstance(grid_db, pd.DataFrame): + raise TypeError( + "Grid data input (`grid_data`) must either be a `Path` or `pandas.DataFrame` object." 
+ ) + else: + grid_data = grid_db + + # Extract coast data from database if needed + if isinstance(coast_db, Path): + # ---- SELECT + coast_data = SQL(coast_db, "select", table_name="coastline_df") + elif coast_data is None: + # ---- SELECT from `grid_data` + coast_data = SQL(grid_db, "select", table_name="coastline_df") + elif not isinstance(coast_db, pd.DataFrame): + raise TypeError( + "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object, " + "or exist within the SQL database as a table (`'coastline_df'`) within the `grid_data` " + "input (i.e. `grid_data.db`)." + ) + else: + coast_data = coast_db + + # Format columns if needed (well-known-text to Polygon) + # ---- `grid_data` + if isinstance(grid_data["geometry"][0], str): + grid_data["geometry"] = grid_data["geometry"].apply(wkt.loads) + # ---- `coastline_data` + if isinstance(coast_data["geometry"][0], str): + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + + # Generate GeoDataFrames + # ---- `grid` + grid_gdf = gpd.GeoDataFrame(grid_data, geometry="geometry", crs=projection) + # ---- `coast` + coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) + + # Get appropriate plot axis-limits + axis_limits = grid_gdf.total_bounds + + # Variable label dictionary map + VARIABLE_MAP = { + "number_density_mean": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$", + "colormap": "viridis", + }, + "biomass_density_mean": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$", + "colormap": "plasma", + }, + "biomass": { + "name": "Biomass", + "units": "kg", + "colormap": "cividis", + }, + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$", + "colormap": "inferno", + } + } + + # Create a figure and a 2x2 grid of subplots + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # List of variables to plot + variables = list(VARIABLE_MAP.keys()) + + # Iterate through and plot all subplots + for ax, var in zip(axes.flat, variables): + # ---- Get the colormap + colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # ---- Invert + newcolors = colormap (np.linspace(0, 1, 256))[::-1] + # ---- Define `white` + white = np.array([1, 1, 1, 1]) + # ---- Replace "start" color + newcolors[0, :] = white + # ---- Create the new custom colormap + custom_cmap = ListedColormap(newcolors) + # ---- Normalize colorscale + norm=plt.Normalize(vmin=grid_gdf[var].min(), vmax=grid_gdf[var].max()) + # ---- Plot the polygons with color fills based on the variable (non-zero) + grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, + norm=norm, + markersize=0, linewidth=0.5) + # ---- Add coastline data layer + coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + # ---- Set axis limits + ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*1.01) + ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + # ---- Trim down the margins + ax.margins(0,0) + # ---- Set adjustable aspect ratio + # ax.set_aspect('equal', adjustable='box') + # ---- Set the title and labels + var_info = VARIABLE_MAP[var] + ax.set_title(f"{var_info['name']}") + # ---- Set axis labels + plt.xlabel(u'Longitude (\u00B0E)') + plt.ylabel(u'Latitude (\u00B0N)') + # ---- Add colorbar + sm = plt.cm.ScalarMappable(cmap=custom_cmap, + norm=plt.Normalize(vmin=grid_gdf[var].min(), + vmax=grid_gdf[var].max())) + sm._A = [] # fake up the array of the scalar mappable + cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + 
cbar.set_label(f"{var_info['units']}") + # ---- Add scalebar + scalebar_length = 250 # Length of scale bar in km + scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km + # ---- Transform scale bar coordinates to axis units + # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 + # scalebar_y = axis_limits[1]*0.98 + (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.1 + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + x_scale = (x1 - x0) * 0.1 + y_scale = (y1 - y0) * 0.1 + # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # [scalebar_y, scalebar_y], color='black', lw=2) + ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], color='black', lw=2) + # ---- Add scale text + ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, + f'{scalebar_length} km', ha='center', va='top', color='black') + + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, + # f'{scalebar_length} km', ha='center', va='bottom', color='black') + + # Adjust layout + plt.tight_layout() + + # Show the plot + plt.show() + +def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None): + + # Extract grid data from database if needed + if isinstance(survey_data_db, Path): + # ---- SELECT + survey_data = SQL(survey_data_db, "select", table_name="survey_data_df") + elif not isinstance(survey_data_db, pd.DataFrame): + raise TypeError( + "Grid data input (`grid_data`) must either be a `Path` or `pandas.DataFrame` object." + ) + else: + survey_data = survey_data_db + + # Extract coast data from database if needed + if isinstance(coast_db, Path): + # ---- SELECT + coast_data = SQL(coast_db, "select", table_name="coastline_df") + elif not isinstance(coast_db, pd.DataFrame): + raise TypeError( + "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object." + ) + else: + coast_data = coast_db + + # Format columns if needed (well-known-text to Polygon) + # ---- `coastline_data` + if isinstance(coast_data["geometry"][0], str): + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + + # Generate GeoDataFrames + # ---- `grid` + survey_gdf = gpd.GeoDataFrame(survey_data, + geometry=gpd.points_from_xy(survey_data["longitude"], + survey_data["latitude"]), + crs=projection) + # ---- `coast` + coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) + + # Get appropriate plot axis-limits + axis_limits = survey_gdf.total_bounds + + # Variable label dictionary map + VARIABLE_MAP = { + "number_density": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$", + "colormap": "inferno", + "minimum": 0.0, + "cbar_reverse": True, + "size": [25, 250] + }, + "biomass_density": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$", + "colormap": "plasma", + "minimum": 0.0, + "cbar_reverse": True, + "size": [25, 250] + }, + "nasc": { + "name": "Nautical area scattering coefficient", + "units": "$\\mathregular{m^{2}~nmi^{-2}}$", + "colormap": "viridis", + "minimum": 0.0, + "cbar_reverse": False, + "size": [25, 250] + }, + "max_Sv": { + "name": "Max $\\mathregular{S_V}$", + "units": "dB re. 
1 $\\mathregular{m^-1}$", + "colormap": "viridis", + "minimum": -999, + "cbar_reverse": True, + "color_threshold": { + "minimum": -80.0, + "maximum": -36.0 + }, + "size": [5, 200] + }, + # "mean_Sv": { + # "name": "$Mean \\mathregular{S_V}$", + # "units": "dB re. 1 $\\mathregular{m^-1}$", + # "colormap": "viridis", + # "minimum": -999, + # "cbar_reverse": True, + # "color_threshold": { + # "minimum": -80.0, + # "maximum": -36.0 + # } + # }, + } + + # List of variables to plot + variables = list(VARIABLE_MAP.keys()) + + def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): + + # Censor values if needed + sizes = values.copy() + sizes.loc[sizes < min_value] = min_value + sizes.loc[sizes > max_value] = max_value + + return ( + ((sizes - min_value) / (max_value - min_value)) + * (max_size - min_size) + min_size + ) + + # Create a figure and a 2x2 grid of subplots + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # Iterate through and plot all subplots + for ax, var in zip(axes.flat, variables): + # ---- Get the colormap + colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # ---- Invert + if VARIABLE_MAP[var]["cbar_reverse"]: + newcolors = colormap(np.linspace(0, 1, 256))[::-1] + # ---- Create the new custom colormap + custom_cmap = ListedColormap(newcolors) + # ---- Plot cruisetrack + # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") + ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + linewidth=0.25, linestyle="-") + # ---- Drop "empty" values + sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] + # ---- Assign color range + if "color_threshold" in VARIABLE_MAP[var].keys(): + min_value = VARIABLE_MAP[var]["color_threshold"]["minimum"] + max_value = VARIABLE_MAP[var]["color_threshold"]["maximum"] + else: + min_value = sub_gdf[var].min() + max_value = sub_gdf[var].max() + # ---- Normalize colorscale + norm=plt.Normalize(vmin=min_value, vmax=max_value) + # ---- Plot the points with color fills based on the variable (non-zero) + ax.scatter( + [geom.x for geom in sub_gdf.geometry], + [geom.y for geom in sub_gdf.geometry], + c=sub_gdf[var], + s=scale_sizes(values=sub_gdf[var], + min_value=min_value, + max_value=max_value, + min_size=VARIABLE_MAP[var]["size"][0], + max_size=VARIABLE_MAP[var]["size"][1]), + cmap=custom_cmap, + norm=norm, + edgecolor="black", + linewidths=0.5 + ) + # ---- Add coastline data layer + coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + # ---- Set axis limits + ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*0.995) + ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + # ---- Trim down the margins + ax.margins(0,0) + # ---- Set adjustable aspect ratio + # ax.set_aspect('equal', adjustable='box') + # ---- Set the title and labels + var_info = VARIABLE_MAP[var] + ax.set_title(f"{var_info['name']}") + # ---- Set axis labels + plt.xlabel(u'Longitude (\u00B0E)') + plt.ylabel(u'Latitude (\u00B0N)') + # ---- Add colorbar + sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) + sm._A = [] # fake up the array of the scalar mappable + cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + cbar.set_label(f"{var_info['units']}") + # ---- Add scalebar + scalebar_length = 250 # Length of scale bar in km + scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km + # ---- Transform scale bar coordinates to axis units + # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 + # scalebar_y = 
axis_limits[1]*0.98 + (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.1 + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + x_scale = (x1 - x0) * 0.1 + y_scale = (y1 - y0) * 0.1 + # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # [scalebar_y, scalebar_y], color='black', lw=2) + ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], color='black', lw=2) + # ---- Add scale text + ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, + f'{scalebar_length} km', ha='center', va='top', color='black') + + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, + # f'{scalebar_length} km', ha='center', va='bottom', color='black') + + # Adjust layout + plt.tight_layout() + + # Show the plot + plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index e95f7336..47844f25 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,5 +1,7 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL +import echopop.live.live_visualizer as elv +from pathlib import Path #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration @@ -56,7 +58,7 @@ # !!! The SQL functions will fail if the tables have not yet been created/initialized # ---- ACOUSTICS # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") # NOTE: Along-track acoustically-derived number/biomass densities and NASC SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") # ---- BIOLOGICAL @@ -66,95 +68,15 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") - -dat = realtime_survey.input["acoustics"]["prc_nasc_df"].copy() -dat = dat[dat.latitude > 40] -dat = dat[dat.depth > 20] - -import matplotlib.pyplot as plt -import seaborn as sns -from geopy.distance import geodesic -import pandas as pd -from datetime import datetime -import matplotlib.dates as mdates -def calculate_distances(df): - distances = [0] # Start with 0 for the first point - for i in range(1, len(df)): - point1 = (df.iloc[i - 1]['latitude'], df.iloc[i - 1]['longitude']) - point2 = (df.iloc[i]['latitude'], df.iloc[i]['longitude']) - distances.append(geodesic(point1, point2).meters) - return distances - -def parse_datetime(date_str): - # List of possible formats - formats = [ - '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds - '%Y-%m-%d %H:%M:%S', # Without fractional seconds - '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds - '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds - ] - - for fmt in formats: - try: - return pd.to_datetime(date_str, format=fmt) - except (ValueError, TypeError): - continue # Try the next format - - return pd.NaT # Return NaT if no formats match - -dat["ping_time"] = dat["ping_time"].apply(parse_datetime) - -pivot_table = dat.pivot_table(index=["depth"], columns=["ping_time"], 
values=["NASC"], aggfunc="mean") -# Get the unique distance and depth values for plotting -plt.figure(figsize=(10, 8)) -ax = sns.heatmap(pivot_table, cmap="viridis", cbar_kws={'label': 'NASC'}) -plt.gca().xaxis.set_major_locator(mdates.MinuteLocator(interval=30)) # Major ticks every 30 minutes -plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) # Format as hour:minute -plt.gcf().autofmt_xdate() -ax.set_xticks(ax.get_xticks()[::max(len(ax.get_xticks()) // 10, 1)]) # Show fewer ticks if necessary -plt.xlabel('Ping time') -plt.ylabel('Depth') -# plt.gca().invert_yaxis() # To have depth increasing downwards like in a typical depth plot -plt.show() - - -dat.groupby(["ping_time"]).size() -unique_pairs = dat.drop_duplicates(subset=['latitude', 'longitude']).sort_values("ping_time") - -unique_pairs["d"] = calculate_distances(dat) -df['cumulative_distance'] = df['distance'].cumsum() - -unique_distances = dat.groupby('source')[['latitude', 'longitude']].unique().reset_index() -unique_distances = unique_distances.explode('distance') - - -# Create a pivot table to reshape the dataframe suitable for a heatmap -dat['source_id'] = dat['source'].astype('category').cat.codes -pivot_table = dat.pivot(index=["depth"], columns=["distance"], values=["NASC"]) -dat.groupby('source')['distance'].cumsum() -plt.plot(index="depth", columns="distance", values="NASC") -plt.show() - -data = { - 'distance': [1, 1, 2, 2, 1, 1, 3, 3], - 'depth': [1, 2, 1, 2, 1, 2, 1, 2], - 'source': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'] -} -dat = pd.DataFrame(data) -dat = dat.sort_values(by=['source', 'distance']) -unique_distances = dat.groupby('source')['distance'].unique().reset_index() -unique_distances = unique_distances.explode('distance') - -unique_distances['distance'] = pd.to_numeric(unique_distances['distance'], errors='coerce') -unique_distances['distance_diff'] = unique_distances.groupby('source')['distance'].diff().fillna(0) -unique_distances['cumsum_diff'] = unique_distances.groupby('source')['distance_diff'].cumsum() -unique_distances['Cumsum_dist'] = unique_distances['cumsum_diff'].cumsum() -unique_distances['Cumsum_dist'] = pd.to_numeric(unique_distances['Cumsum_dist'], errors='coerce') -dat = dat.merge(unique_distances[['source', 'distance', 'Cumsum_dist']], on=['source', 'distance'], how='left') - - -# Calculate cumulative sum of distances for each source -dat['Cumsum_dist'] = dat.groupby('source')['distance'].transform(lambda x: x.cumsum()) -dat['Cumsum_dist_within_source'] = dat.groupby('source')['distance'].cumsum() - -dat['Cumsum_dist'] = dat.groupby('source')['Cumsum_dist_within_source'].transform(lambda x: x + x.shift(1).fillna(0).cumsum()) +#################################################################################################### +# FROM THE `LiveSurvey` object ! 
+# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +grid_db = Path(realtime_survey.config["database"]["grid"]) +survey_data_db = Path('C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database/acoustics.db') +coast_db = grid_db +projection = realtime_survey.config["geospatial"]["projection"] +# NOTE: PLOTS +# ---- PLOT GRID +elv.plot_livesurvey_grid(grid_db, projection, coast_db) +# ---- PLOT TRACK +elv.plot_livesurvey_track(survey_data_db, projection, coast_db) From 76085c3f63b879d7c06053b15d0409dd2434b162 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 15 Aug 2024 11:57:27 -0700 Subject: [PATCH 50/81] Minor changes to axis labels --- echopop/live/live_visualizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 0975c08a..cd085399 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -117,8 +117,8 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - plt.xlabel(u'Longitude (\u00B0E)') - plt.ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel(u'Longitude (\u00B0E)') + ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=plt.Normalize(vmin=grid_gdf[var].min(), @@ -320,8 +320,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - plt.xlabel(u'Longitude (\u00B0E)') - plt.ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel(u'Longitude (\u00B0E)') + ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable From e1ec7a0d4cd70fe1640cda858b0f409c3deafa2b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 15 Aug 2024 14:13:50 -0700 Subject: [PATCH 51/81] Plotting function for bio distirbutions --- echopop/live/live_visualizer.py | 134 ++++++++++++++++++++++++++++++++ echopop/test_workflow.py | 14 ++++ 2 files changed, 148 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index cd085399..89977b12 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -356,3 +356,137 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Show the plot plt.show() + +def plot_livesurvey_distributions(weight_table: pd.DataFrame, + stratum_table: pd.DataFrame, + specimen_table: pd.DataFrame, + length_table: pd.DataFrame, + biology_db: Optional[Path] = None): + + # If calling from SQL database + if biology_db is not None: + weight_table = SQL(biology_db, "select", table_name="length_weight_df") + stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") + specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") + length_table = SQL(biology_db, "select", table_name="length_df") + elif not all([isinstance(df, pd.DataFrame) for df in [weight_table, stratum_table, + specimen_table, length_table]]): + raise TypeError( + "All tables must be a `pandas.DataFrame." 
+ ) + + # Organize the weight table data + # ---- Sum weights by stratum, sex, and length_bin + aggregated_data = ( + weight_table.groupby(['stratum', 'sex', 'length_bin'])['weight'].sum().reset_index() + ) + # ---- Create a column to indicate 'all' sexes + aggregated_data_all = ( + aggregated_data.groupby(['stratum', 'length_bin'])['weight'].sum().reset_index() + ) + aggregated_data_all['sex'] = 'all' + # ---- Combine the male, female, and all data + plot_weight_data = pd.concat([aggregated_data, aggregated_data_all], ignore_index=True) + + # Define the sexes + sexes = plot_weight_data.sex.unique().tolist() + + # Organize the length table data + bins = plot_weight_data.length_bin.unique() + 1 + full_bins = np.concatenate([[bins[0] - np.diff(bins).mean() / 2], bins]) + length_table["length_bin"] = ( + pd.cut(length_table["length"], bins=full_bins, labels=bins - 1).astype(float) + ) + length_table_sex = ( + length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() + ) + length_table_all = ( + length_table.groupby(["stratum", "length_bin"])["length_count"].sum().reset_index() + ) + length_table_all['sex'] = 'all' + full_count = ( + specimen_table.meld(length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"]) + .loc[lambda x: x.sex.isin(sexes)] + .groupby(['stratum', 'sex', 'length_bin'])['length_count'].sum().reset_index() + ) + full_count["total"] = full_count.groupby(["stratum", "sex"])["length_count"].transform("sum") + full_count["number_proportion"] = full_count["length_count"] / full_count["total"] + # ---- Combine into the full dataset for plotting + plot_count_data = ( + plot_weight_data + .merge(full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), + on=["stratum", "sex", "length_bin"], how="left") + ).fillna(0.0) + + # Get a color map + colors = plt.colormaps['tab10'] + num_strata = len(stratum_table['stratum'].unique()) + num_sexes = len(sexes) + color_map = colors(num_strata) + + # Plot + fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(6, 8), sharex=True, sharey=True) + plt.subplots_adjust(hspace=0.08, wspace=0.05, bottom=0.25) # Adjust spacing between plots + + # Plot weights and counts + for i, sex in enumerate(sexes): + # Weight plot (left column) + ax_weight = axes[i, 0] + data_weight = plot_weight_data[plot_weight_data['sex'] == sex] + for j, (stratum, group) in enumerate(data_weight.groupby('stratum')): + # color = colors(i / num_strata) if num_strata > 1 else colors(0) + color = colors(j / num_strata) if num_strata > 1 else colors(0) + total = group["weight"].sum() + group["proportions"] = group["weight"] / total if total > 0.0 else 0.0 + ms = 5 if group["proportions"].max() > 0.0 else 0.1 + # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + # label=f'Stratum {stratum}', color=color, ms=ms) + ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + label=f'Stratum {stratum}', color=color, ms=ms) + if i == 0: + ax_weight.set_title(f'Weight') + if i < num_sexes - 1: # No x-ticks for non-bottom plots + ax_weight.set_xlabel('') + if i == num_sexes // 2: + ax_weight.set_ylabel('Within-stratum proportion [0, 1]') + if i == num_sexes - 1: # Bottom plot + ax_weight.set_xlabel('Length bin (cm)') + ax_weight.set_ylim(0.0, 1.0) + # Add label in the top-left corner + ax_weight.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), + transform=ax_weight.transAxes, + fontsize=12, verticalalignment='top', + bbox=dict(facecolor='white', alpha=0.8, + edgecolor='none')) 
+ + # Count plot (right column) + ax_count = axes[i, 1] + data_count = plot_count_data[plot_count_data['sex'] == sex] + for j, (stratum, group) in enumerate(data_count.groupby('stratum')): + color = colors(j / num_strata) if num_strata > 1 else colors(0) + ms = 5 if group["number_proportion"].max() > 0.0 else 0.1 + ax_count.plot(group['length_bin'], group['number_proportion'], + marker='o', label=f'Stratum {stratum}', color=color, ms=ms) + if i == 0: + ax_count.set_title(f"Number") + if i < num_sexes - 1: # No x-ticks for non-bottom plots + ax_count.set_xlabel('') + if i == num_sexes - 1: # Bottom plot + ax_count.set_xlabel('Length bin (cm)') + ax_count.set_ylim(0.0, 1.0) + # Add label in the top-left corner + ax_count.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), + transform=ax_count.transAxes, + fontsize=12, verticalalignment='top', + bbox=dict(facecolor='white', alpha=0.8, + edgecolor='none')) + # Create a new axes for the legend + legend_ax = fig.add_axes([0.15, 0.05, 0.7, 0.1]) # Position the legend axes (left, bottom, width, height) + legend_ax.axis('off') # Hide the new axes + + # Create a shared legend in the bottom-most subplot + handles, labels = axes[2, 1].get_legend_handles_labels() # Get handles and labels from the bottom-left plot + fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), + ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') + + plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 47844f25..74968fdc 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -68,6 +68,10 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# NOTE: Stratum summary tables +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +SQL(realtime_survey.config["database"]["biology"], "map") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table @@ -80,3 +84,13 @@ elv.plot_livesurvey_grid(grid_db, projection, coast_db) # ---- PLOT TRACK elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +# ---- PLOT DISTRIBUTIONS +weight_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="length_weight_df") +stratum_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="strata_summary_df") +specimen_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="specimen_data_df") +length_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="length_df") +elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) \ No newline at end of file From b36e28489c299f6f63ab6e75925acc9a526bbca4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:21:32 -0700 Subject: [PATCH 52/81] Possible fix to `BIGINT` SQL error --- echopop/live/sql_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index d1504f90..ce118dce 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -435,6 +435,7 @@ def initialize_database(root_directory: Path, file_settings: dict): "INTEGER": int, "DATETIME": str, "TEXT": str, + "BIGINT": int, } def sql_group_update(db_file: str, From c761a9b2a226c0f64fde1857bc702b7cfb7ce41b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:24:41 -0700 Subject: [PATCH 53/81] Validator for successful population run --- echopop/live/live_survey.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 9b468191..79ac8704 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -83,6 +83,8 @@ def __init__( # Initialize the extrapolation grid initialize_grid(self.config) + # TODO: add quick utility function to get db filepaths + # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) @@ -335,7 +337,9 @@ def estimate_population(self, eldp.acoustic_pipeline(self.input["acoustics"], self.input["spatial"]["strata"], self.config, - verbose=verbose) + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["acoustic_population"] = True # method if working_dataset == "biology": @@ -343,4 +347,7 @@ def estimate_population(self, self.input["spatial"]["strata"], self.config, verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["biology_population"] = True + From 0f1e8f2b9bc8188406c06b8370f51250bf2e5364 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:26:26 -0700 Subject: [PATCH 54/81] Fix to population validation handshake --- echopop/live/live_survey.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 79ac8704..5523c0e4 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -331,23 +331,25 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): def estimate_population(self, working_dataset: Literal["acoustic", "biology"], verbose: bool = True): - + + self.meta["provenance"][f"{working_dataset}_population"] = False + # method - if working_dataset == "acoustic": - eldp.acoustic_pipeline(self.input["acoustics"], - 
self.input["spatial"]["strata"], - self.config, - verbose=verbose) - # --- Validate successful run - self.meta["provenance"]["acoustic_population"] = True + if working_dataset == "acoustic": + eldp.acoustic_pipeline(self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["acoustic_population"] = True # method - if working_dataset == "biology": - eldp.biology_pipeline(self.input["biology"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose) - # --- Validate successful run - self.meta["provenance"]["biology_population"] = True + if working_dataset == "biology": + eldp.biology_pipeline(self.input["biology"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["biology_population"] = True From 370b16f31da66ae0b1ade933f9bcf5cb55f20793 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 15:12:48 -0700 Subject: [PATCH 55/81] Database pathing changes --- echopop/live/live_data_processing.py | 18 ++++++++++++++++-- echopop/live/live_survey.py | 3 ++- echopop/live/sql_methods.py | 6 ++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index a235bf58..8673bd53 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -33,6 +33,20 @@ def get_average_strata_weights(db_file: str, else: return None +def configure_database_paths(file_configuration: dict): + + # Extract input directory settings + file_settings = file_configuration["input_directories"] + + # Get database directory + database_dir = file_configuration["database_directory"] + + # Update configuration + file_configuration["database"].update({ + dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) + for dataset in file_settings.keys() if "database_name" in file_settings[dataset] + }) + def acoustic_pipeline(acoustic_dict: dict, strata_df: pd.DataFrame, file_configuration: dict, @@ -81,8 +95,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, - acoustic_dict, - unique_columns=spatial_column + contrast_columns) + acoustic_dict, + unique_columns=spatial_column + contrast_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 5523c0e4..d8470366 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -83,7 +83,8 @@ def __init__( # Initialize the extrapolation grid initialize_grid(self.config) - # TODO: add quick utility function to get db filepaths + # Add database paths to configuration attribute + eldp.configure_database_paths(self.config) # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index ce118dce..a0cf299c 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -786,6 +786,12 @@ def sql_update_strata_summary(source_db: str, ATTACH DATABASE '{source_db}' AS source; ATTACH DATABASE '{target_db}' AS target; + -- Verify the source database tables + SELECT name FROM source.sqlite_master WHERE type='table'; + + -- Query the source table directly + SELECT * FROM 
source.{source_table} LIMIT 1; + """ # Dynamically format the cross-database command From aca08e47efe293bb0e26eb4c7d11924a0fa8392a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 15:46:15 -0700 Subject: [PATCH 56/81] Fixes to oddities due to `NaN` for cruise plot --- echopop/live/live_acoustics.py | 4 ++++ echopop/live/live_visualizer.py | 10 +++++----- echopop/test_workflow.py | 15 +++++++-------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 5aea43f7..44e61ae0 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -113,6 +113,10 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() + # ---- Change first cell ! + acoustic_df.loc[0, "dz"] = ( + acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] + ) # Initialize echometrics dictionary echometrics = {} diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 89977b12..3985ec53 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -207,7 +207,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, - "size": [25, 250] + "size": [25, 150] }, "biomass_density": { "name": "Mean biomass density", @@ -215,7 +215,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "plasma", "minimum": 0.0, "cbar_reverse": True, - "size": [25, 250] + "size": [25, 150] }, "nasc": { "name": "Nautical area scattering coefficient", @@ -223,7 +223,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "size": [25, 250] + "size": [25, 150] }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", @@ -235,7 +235,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "minimum": -80.0, "maximum": -36.0 }, - "size": [5, 200] + "size": [5, 100] }, # "mean_Sv": { # "name": "$Mean \\mathregular{S_V}$", @@ -305,7 +305,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): cmap=custom_cmap, norm=norm, edgecolor="black", - linewidths=0.5 + linewidths=0.1 ) # ---- Add coastline data layer coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 74968fdc..05b04c03 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -70,14 +70,13 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") # NOTE: Stratum summary tables SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -SQL(realtime_survey.config["database"]["biology"], "map") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) -survey_data_db = Path('C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database/acoustics.db') coast_db = grid_db +biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] # NOTE: PLOTS # ---- PLOT GRID @@ -85,12 +84,12 @@ # ---- PLOT TRACK elv.plot_livesurvey_track(survey_data_db, projection, coast_db) # ---- PLOT DISTRIBUTIONS -weight_table = SQL(realtime_survey.config["database"]["biology"], "select", +weight_table = SQL(biology_db, "select", table_name="length_weight_df") -stratum_table = SQL(realtime_survey.config["database"]["biology"], "select", +stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") -specimen_table = SQL(realtime_survey.config["database"]["biology"], "select", +specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") -length_table = SQL(realtime_survey.config["database"]["biology"], "select", +length_table = SQL(biology_db, "select", table_name="length_df") -elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) \ No newline at end of file +elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) From eaec504fbfd3e7899a22b19039ad51794e46ab69 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 16:14:18 -0700 Subject: [PATCH 57/81] Updated plotting method for `None` --- echopop/live/live_visualizer.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 3985ec53..1260b787 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -253,6 +253,9 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], # List of variables to plot variables = list(VARIABLE_MAP.keys()) + # Go completed variables + intact_variables = [var for var in variables if not survey_gdf[var].isnull().all()] + def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Censor values if needed @@ -265,11 +268,16 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): * (max_size - min_size) + min_size ) - # Create a figure and a 2x2 grid of subplots - fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + # Create a figure and a 2xn grid of subplots + if len(intact_variables) == 4: + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + elif len(intact_variables) == 3: + fig, axes = plt.subplots(1, 3, figsize=(10, 10)) + elif len(intact_variables) == 2: + fig, axes = plt.subplots(1, 1, figsize=(10, 10)) # Iterate through and plot all subplots - for ax, var in zip(axes.flat, variables): + for ax, var in zip(axes.flat, intact_variables): # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert @@ -304,8 +312,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): max_size=VARIABLE_MAP[var]["size"][1]), cmap=custom_cmap, norm=norm, - edgecolor="black", - linewidths=0.1 + # edgecolor="black", + # linewidths=0.1 ) # ---- Add coastline data layer coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") @@ -325,7 +333,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # ---- Add 
colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable - cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + cbar = fig.colorbar(sm, ax=ax, shrink=0.5, fraction=0.075, pad=0.1) cbar.set_label(f"{var_info['units']}") # ---- Add scalebar scalebar_length = 250 # Length of scale bar in km From 0924489337c48a3e5c9432db55b2edd6ac6ae73b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 16:32:59 -0700 Subject: [PATCH 58/81] Matplotlib to panel update --- echopop/live/live_visualizer.py | 9 ++++++--- echopop/test_workflow.py | 22 +++++++++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 1260b787..37b67a7b 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -154,7 +154,8 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], plt.tight_layout() # Show the plot - plt.show() + # plt.show() + return fig def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], projection: str, @@ -363,7 +364,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): plt.tight_layout() # Show the plot - plt.show() + # plt.show() + return fig def plot_livesurvey_distributions(weight_table: pd.DataFrame, stratum_table: pd.DataFrame, @@ -497,4 +499,5 @@ def plot_livesurvey_distributions(weight_table: pd.DataFrame, fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') - plt.show() + # plt.show() + return fig diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 05b04c03..02b4e557 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -72,6 +72,8 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
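The Panel wiring added below wraps each returned Matplotlib figure with the generic `pn.panel(...)` helper; an alternative sketch of the same idea uses Panel's dedicated Matplotlib pane (assuming `panel` and `matplotlib` are installed and rendering happens off-screen):

    import matplotlib
    matplotlib.use("agg")          # render off-screen; figures are only embedded, never shown
    import matplotlib.pyplot as plt
    import panel as pn

    pn.extension()

    def make_figure():
        fig, ax = plt.subplots(figsize=(4, 3))
        ax.plot([0, 1, 2], [0, 1, 4])
        return fig                 # returning the figure (not plt.show()) is what lets Panel embed it

    pane = pn.pane.Matplotlib(make_figure(), tight=True)
    # pane.show()                  # opens a local server in the browser, like plt_to_pn() below
    # pn.serve({"demo": pane})     # or serve it as a named page, like serve_panels() further on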
+# ---- Convert to a Panel +import panel as pn # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) @@ -79,10 +81,20 @@ biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] # NOTE: PLOTS +# Ensure Panel is initialized +pn.extension() +# ---- Helper function +def plt_to_pn(fig): + # Convert to a panel object + panel = pn.panel(fig) + # Display + panel.show() # OR panel.servable() if you want to serve it in a Panel server # ---- PLOT GRID -elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +plt_to_pn(fig) # ---- PLOT TRACK -elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +fig = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +plt_to_pn(fig) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", table_name="length_weight_df") @@ -92,4 +104,8 @@ table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") -elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +fig = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +plt_to_pn(fig) + + + From 1a901864c15437cfd23a3e4d6c3a0389cd721a23 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 17:09:35 -0700 Subject: [PATCH 59/81] Panel naming update --- echopop/test_workflow.py | 89 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 02b4e557..32105daa 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -93,7 +93,7 @@ def plt_to_pn(fig): fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) plt_to_pn(fig) # ---- PLOT TRACK -fig = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +fig1 = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) plt_to_pn(fig) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", @@ -104,8 +104,93 @@ def plt_to_pn(fig): table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") -fig = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) plt_to_pn(fig) +### MULTIPANEL +panel0 = pn.panel(fig, name='Gridded population estimates') +panel1 = pn.panel(fig1, name='Alongtrack population estimates') +panel2 = pn.panel(fig2, name='Length and weight distributions') + +def serve_panels(): + # Create links to each panel + home = pn.Column( + pn.pane.Markdown("# Main Page"), + pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", sizing_mode="stretch_width"), + pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", sizing_mode="stretch_width"), + pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", sizing_mode="stretch_width") + ) + + # Serve the home page and individual panels + pn.serve({ + 'Main Page': home, + 'gridded_population_estimates': panel0, + 'alongtrack_population_estimates': panel1, + 'length_weight_distributions': panel2 + }, show=True) +# Run the function to serve panels +serve_panels() + + + +def 
serve_panels(): + panel0.servable(title='Gridded population', location=True) + panel1.servable(title='Alongtrack population') + panel2.servable(title='Length/weight distribution') + pn.serve({'gridded a': panel0, 'fig1': panel1, 'fig2': panel2}, show=True) +serve_panels() + +def serve_panels(): + # Assign titles and make panels servable + panel0.servable(title='Gridded population') + panel1.servable(title='Alongtrack population') + panel2.servable(title='Length/weight distribution') + + # Create a dictionary layout + layout = { + 'Gridded population': panel0, + 'Alongtrack population': panel1, + 'Length/weight distribution': panel2 + } + + # Serve the panels + pn.serve(layout, show=True) + +# Run the function to serve panels +serve_panels() + +layout = pn.Column( + pn.pane.Markdown("# Gridded population", style={'font-size': '20px'}), + panel0, + pn.pane.Markdown("# Alongtrack population", style={'font-size': '20px'}), + panel1, + pn.pane.Markdown("# Length/weight distribution", style={'font-size': '20px'}), + panel2 +) + +def serve_panels(): + # Serve the layout with titles + layout.servable() + pn.serve(layout, show=True) + +# Run the function to serve panels +serve_panels() + +# Create a layout +layout = pn.Tabs(('Plot 1', panel1), ('Plot 2', panel2)) + +# Serve the layout +layout.servable() +pn.serve(layout, show=True) +# Run the server to display panels in separate windows +# Create a layout with tabs +tabs = pn.Tabs(('Alongtrack population', panel1), ('Length/weight distribution', panel2)) +tabs.servable() +pn.serve(tabs, port=5006, show=True) +pn.serve({'Plot 1': panel1, 'Plot 2': panel2}, show=True) + +combined_panel = pn.Column(panel1, panel2) +combined_panel.show() +panel1.show() From 32c0b993a923a47ecee6f80b53aa955fb75060c0 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 19:16:13 -0700 Subject: [PATCH 60/81] Cleaned up `test_workflow` --- echopop/test_workflow.py | 67 +--------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 32105daa..a22fa10b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -128,69 +128,4 @@ def serve_panels(): 'length_weight_distributions': panel2 }, show=True) # Run the function to serve panels -serve_panels() - - - -def serve_panels(): - panel0.servable(title='Gridded population', location=True) - panel1.servable(title='Alongtrack population') - panel2.servable(title='Length/weight distribution') - pn.serve({'gridded a': panel0, 'fig1': panel1, 'fig2': panel2}, show=True) -serve_panels() - -def serve_panels(): - # Assign titles and make panels servable - panel0.servable(title='Gridded population') - panel1.servable(title='Alongtrack population') - panel2.servable(title='Length/weight distribution') - - # Create a dictionary layout - layout = { - 'Gridded population': panel0, - 'Alongtrack population': panel1, - 'Length/weight distribution': panel2 - } - - # Serve the panels - pn.serve(layout, show=True) - -# Run the function to serve panels -serve_panels() - -layout = pn.Column( - pn.pane.Markdown("# Gridded population", style={'font-size': '20px'}), - panel0, - pn.pane.Markdown("# Alongtrack population", style={'font-size': '20px'}), - panel1, - pn.pane.Markdown("# Length/weight distribution", style={'font-size': '20px'}), - panel2 -) - -def serve_panels(): - # Serve the layout with titles - layout.servable() - pn.serve(layout, show=True) - -# Run the function to serve panels -serve_panels() - -# Create a layout 
-layout = pn.Tabs(('Plot 1', panel1), ('Plot 2', panel2)) - -# Serve the layout -layout.servable() -pn.serve(layout, show=True) -# Run the server to display panels in separate windows -# Create a layout with tabs -tabs = pn.Tabs(('Alongtrack population', panel1), ('Length/weight distribution', panel2)) -tabs.servable() -pn.serve(tabs, port=5006, show=True) -pn.serve({'Plot 1': panel1, 'Plot 2': panel2}, show=True) - -combined_panel = pn.Column(panel1, panel2) -combined_panel.show() -panel1.show() - - - +serve_panels() \ No newline at end of file From 65ab70dd361a81056e13bd59dca270b0fc4d39c4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 20:24:02 -0700 Subject: [PATCH 61/81] Changed dynamic colorrange for some plots --- echopop/live/live_visualizer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 37b67a7b..1ba8dd74 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -208,6 +208,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6, + }, "size": [25, 150] }, "biomass_density": { @@ -216,6 +220,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "plasma", "minimum": 0.0, "cbar_reverse": True, + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6, + }, "size": [25, 150] }, "nasc": { @@ -224,6 +232,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, + "color_threshold": { + "minimum": 1e2, + "maximum": 1e4 + }, "size": [25, 150] }, "max_Sv": { From 27ff2d388c4cd41bad87db0d5158262f68a90414 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 10:57:21 -0700 Subject: [PATCH 62/81] Fix to grid plot colormap scaling/range --- echopop/live/live_visualizer.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 1ba8dd74..2b48ae5c 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -62,21 +62,37 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", "colormap": "viridis", + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6 + }, }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", "colormap": "plasma", + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6 + }, }, "biomass": { "name": "Biomass", "units": "kg", "colormap": "cividis", + "color_threshold": { + "minimum": 1e1 * grid_gdf["area"].max(), + "maximum": 1e6 * grid_gdf["area"].max() + }, }, "abundance": { "name": "Abundance", "units": "$\\it{N}$", "colormap": "inferno", + "color_threshold": { + "minimum": 1e1 * grid_gdf["area"].max(), + "maximum": 1e6 * grid_gdf["area"].max() + }, } } @@ -98,8 +114,16 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], newcolors[0, :] = white # ---- Create the new custom colormap custom_cmap = ListedColormap(newcolors) + # ---- Drop "empty" values + sub_grid_gdf = grid_gdf[grid_gdf[var] > 0.0] + if "color_threshold" in VARIABLE_MAP[var].keys(): + min_value = VARIABLE_MAP[var]["color_threshold"]["minimum"] + max_value = VARIABLE_MAP[var]["color_threshold"]["maximum"] + else: + min_value = 
sub_grid_gdf[var].min() + max_value = sub_grid_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=grid_gdf[var].min(), vmax=grid_gdf[var].max()) + norm=plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the polygons with color fills based on the variable (non-zero) grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, norm=norm, @@ -121,8 +145,7 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, - norm=plt.Normalize(vmin=grid_gdf[var].min(), - vmax=grid_gdf[var].max())) + norm=norm) sm._A = [] # fake up the array of the scalar mappable cbar = fig.colorbar(sm, ax=ax, shrink=0.5) cbar.set_label(f"{var_info['units']}") @@ -232,7 +255,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "color_threshold": { + "color_threshold": { "minimum": 1e2, "maximum": 1e4 }, @@ -349,7 +372,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): cbar = fig.colorbar(sm, ax=ax, shrink=0.5, fraction=0.075, pad=0.1) cbar.set_label(f"{var_info['units']}") # ---- Add scalebar - scalebar_length = 250 # Length of scale bar in km + scalebar_length = 100 # Length of scale bar in km scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km # ---- Transform scale bar coordinates to axis units # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 From 8e21e13d3d90d3291eef4564637fbb1656586359 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 16:53:55 -0700 Subject: [PATCH 63/81] Add dataset validator for biodata --- echopop/live/live_data_loading.py | 73 +++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 96631b35..bde4db7f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -7,6 +7,7 @@ import numpy as np from datetime import datetime import xarray as xr +import os from .live_core import( LIVE_FILE_FORMAT_MAP, @@ -269,6 +270,17 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Initialize the database file initialize_database(database_root_directory, file_settings) + + # Clean the file names + data_files = [ + re.sub(r'//', r'\\', str(filename)).replace('/', '\\') + if not str(filename).startswith('s3://') + else str(filename) + for filename in data_files + ] + + # Drop incomplete datasets + data_files = validate_complete_biology_dataset(data_files, directory_path, file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( @@ -278,6 +290,67 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Return the valid filenames/paths return valid_files +def validate_complete_biology_dataset(data_files: List[str], + directory_path: str, + file_configuration: dict): + + # Get the biology data file settings + file_settings = file_configuration["input_directories"]["biology"] + + # Get the file-specific settings, datatypes, columns, etc. 
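# A worked example of the haul-number parsing set up below, assuming a format string
# such as "{DATE}_{HAUL}_{FILE_ID}" (the real patterns come from `file_name_formats`
# in the survey-year YAML) and one of the biodata filenames exercised in test_workflow.py:
#
#   import os, re
#   fmt = "{DATE}_{HAUL}_{FILE_ID}"                      # assumed, illustrative format
#   name = os.path.splitext("202407_003_catch_perc.csv")[0]
#   tokens = re.findall(r"\{[^}]+\}|[^_]+", fmt)         # ['{DATE}', '{HAUL}', '{FILE_ID}']
#   haul = name.split("_")[tokens.index("{HAUL}")]       # '003'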
+ # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + + # Define helper function for extract haul number from filename strings + def get_file_haul_number(filename, format_string): + # Step 1: Extract the filename from the full path + filename_only = os.path.basename(filename) + + # Remove the file extension from the filename + filename_no_ext = os.path.splitext(filename_only)[0] + + # Split the format string and filename into parts + format_parts = re.findall(r'\{[^}]+\}|[^_]+', format_string) + filename_parts = filename_no_ext.split('_') + + # Find the index of {HAUL} in format_parts + haul_index = format_parts.index('{HAUL}') + + # Extract and return the haul number from filename_parts + if haul_index < len(filename_parts): + return filename_parts[haul_index] + return None + + # Organize dataset by their respective dataset-type + dataset_dict = {key: filter_filenames(directory_path, + ds, + data_files, + file_settings["extension"]) + for key, ds in biology_file_ids.items()} + + # Extract the haul numbers + extracted_hauls = { + key: set(get_file_haul_number(filename, biology_file_ids.get(key, '')) + for filename in filenames) + for key, filenames in dataset_dict.items() + } + + # Find haul numbers that appear in all keys + common_hauls = set.intersection(*extracted_hauls.values()) + + # Filter filenames to keep only those with haul numbers in the common set + filtered_filenames = [ + filename + for key, filenames in dataset_dict.items() + for filename in filenames + if get_file_haul_number(filename, biology_file_ids.get(key, '')) + in common_hauls + ] + + # Return the curated filename list + return filtered_filenames + + def compile_filename_format(file_name_format: str): # Create a copy of `file_name_format` From 0d8e73256d1c084b3ef37e29de112d0ae9f4dc2a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 16:56:30 -0700 Subject: [PATCH 64/81] Apply biodata validator only to biodata... 
--- echopop/live/live_data_loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index bde4db7f..d46cec1e 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -280,7 +280,9 @@ def validate_data_directory(file_configuration: dict, dataset: str, ] # Drop incomplete datasets - data_files = validate_complete_biology_dataset(data_files, directory_path, file_configuration) + if dataset == "biology": + data_files = validate_complete_biology_dataset(data_files, directory_path, + file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( From 3ab32ac0d3e0bccdc0c3c137002c84fd05f75cd4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 18:48:40 -0700 Subject: [PATCH 65/81] Fix to biodata dataset validator --- echopop/live/live_data_loading.py | 17 ++++++++++++++++- echopop/live/sql_methods.py | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index d46cec1e..0d90e2d3 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -281,7 +281,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Drop incomplete datasets if dataset == "biology": - data_files = validate_complete_biology_dataset(data_files, directory_path, + data_files = validate_complete_biology_dataset(data_files, + directory_path, file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) @@ -349,6 +350,20 @@ def get_file_haul_number(filename, format_string): in common_hauls ] + # Get bad files for DEBUG + non_filtered_filenames = [ + filename + for key, filenames in dataset_dict.items() + for filename in filenames + if get_file_haul_number(filename, biology_file_ids.get(key, '')) + not in common_hauls + ] + + print( + f"The following files are parts of incomplete filesets: " + f"{'\n'.join(non_filtered_filenames)}" + ) + # Return the curated filename list return filtered_filenames diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index a0cf299c..682bef65 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -669,11 +669,13 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List if processed: SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, id_columns=["filepath"]) - else: + elif not current_files.empty: SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) # ---- Apply filter by comparing sets and return the output return list(set(files_str) - set(previous_files)), db_file + else: + return None, db_file # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): From 80d2b555fabd740f9d3198a1d824ba09529b1b4f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Tue, 20 Aug 2024 09:03:05 -0700 Subject: [PATCH 66/81] f-string adjustment --- echopop/live/live_data_loading.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 0d90e2d3..06d1a6fc 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -358,10 +358,11 @@ def get_file_haul_number(filename, format_string): if get_file_haul_number(filename, 
biology_file_ids.get(key, '')) not in common_hauls ] - + # ---- Create list + non_filtered_filenames_lst = "\n".join(non_filtered_filenames) print( - f"The following files are parts of incomplete filesets: " - f"{'\n'.join(non_filtered_filenames)}" + f"The following files are parts of incomplete filesets: \n" + f"{non_filtered_filenames_lst}" ) # Return the curated filename list From dd24ebd71888fc3b842e63a4ab8a1ef65d536ea0 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 10:20:27 -0700 Subject: [PATCH 67/81] Change to enable multiple ship data sources --- echopop/live/live_acoustics.py | 13 ++++++++----- echopop/live/live_survey.py | 3 ++- echopop/live/live_visualizer.py | 22 ++++++++++++++++++++-- echopop/live/sql_methods.py | 13 +++++++++++-- 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 44e61ae0..3d8ac59c 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -60,6 +60,8 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Add `ship_id` from the file configuration + prc_nasc_df_filtered.loc[:, "ship_id"] = file_configuration["ship_id"] # ---- Replace NASC `NaN` values with `0.0` prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) # ---- Drop the `frequency_nominal` column and return the output @@ -221,7 +223,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # else: nasc_data_df = ( acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + gridding_column, + .groupby(["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, observed=False) .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) .reset_index() @@ -239,9 +241,9 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # ---- Reorder columns nasc_data_df = nasc_data_df[ gridding_column - + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", - "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", - "occupied_area"] + + ["ship_id", "longitude", "latitude", "ping_time", "source", "nasc", "n_layers", + "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", + "evenness", "occupied_area"] ] # Return the output @@ -261,7 +263,8 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] + key_values = [f"{df.loc[index, "ship_id"]}-{str(index)}-{df.loc[index, 'source']}" + for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index d8470366..a8a2a70e 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -340,7 +340,8 @@ def estimate_population(self, eldp.acoustic_pipeline(self.input["acoustics"], self.input["spatial"]["strata"], self.config, - verbose=verbose) + verbose=verbose, + contrast_columns=["ship_id"]) # --- Validate successful run self.meta["provenance"]["acoustic_population"] = True diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 2b48ae5c..69c33dd5 100644 --- 
a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -304,6 +304,12 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): * (max_size - min_size) + min_size ) + # Define colors for ship_ids (you can customize these colors as needed) + ship_id_colors = { + ship_id: plt.cm.tab10(i) # Use a colormap for distinct colors; adjust as needed + for i, ship_id in enumerate(survey_gdf['ship_id'].unique()) + } + # Create a figure and a 2xn grid of subplots if len(intact_variables) == 4: fig, axes = plt.subplots(2, 2, figsize=(10, 10)) @@ -323,8 +329,18 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): custom_cmap = ListedColormap(newcolors) # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") - ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", - linewidth=0.25, linestyle="-") + # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + # linewidth=0.25, linestyle="-") + handles = [] # List to store legend handles + for ship_id, group in survey_gdf.groupby("ship_id"): + # Sort the group by latitude or longitude + # group = group.sort_values(by=["latitude", "longitude"]) + color = ship_id_colors.get(ship_id, 'gray') + line_handle, = ax.plot(group.geometry.x, group.geometry.y, color=color, + linewidth=0.25, linestyle="-", label=ship_id, zorder=1) + handles.append(line_handle) # Add handle to legend + # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, + # linestyle="-", zorder=1) # ---- Drop "empty" values sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] # ---- Assign color range @@ -348,6 +364,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): max_size=VARIABLE_MAP[var]["size"][1]), cmap=custom_cmap, norm=norm, + zorder = 2 # edgecolor="black", # linewidths=0.1 ) @@ -390,6 +407,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # ---- Add scale text ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, f'{scalebar_length} km', ha='center', va='top', color='black') + # ax.legend(handles=handles, title='Ship ID') # ax.text(scalebar_x + (scalebar_length / 200), # scalebar_y - scalebar_y_offset, diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 682bef65..fc28d2d1 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -744,11 +744,20 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Conditional string formatting helper function + def format_value(value): + if isinstance(value, str): + return f"'{value.replace("'", "''")}'" + return str(value) # ---- Create conditional string conditional_str = " | ".join( - [" & ".join([f"{col} = {val}" for col, val in row.items()]) + [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) for _, row in unique_keys_df.iterrows()] - ) + ) + # conditional_str = " | ".join( + # [" & ".join([f"{col} = {val}" for col, val in row.items()]) + # for _, row in unique_keys_df.iterrows()] + # ) # conditional_str = ( # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" # for col in unique_keys_df.columns]) From 3c4830ac22ee095921d5476deb300795f37f375e Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 11:27:50 -0700 Subject: [PATCH 68/81] Fix 
to cases where lon/lat/ping_time were NaN/NaT --- echopop/live/live_acoustics.py | 2 ++ echopop/live/live_data_processing.py | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 3d8ac59c..f59b3d13 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -45,6 +45,8 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( survey_data[survey_data["frequency_nominal"] == transmit_settings["frequency"]] + # ---- Drop NaN/NaT values from longitude/latitude/ping_time + .dropna(subset=["longitude", "latitude", "ping_time"]) ) # Get grid coordinates diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 8673bd53..a2dcaa46 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -86,7 +86,7 @@ def acoustic_pipeline(acoustic_dict: dict, if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column + contrast_columns) + nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) # ---- Compute the number densities (animals nmi^-2) nasc_biology["number_density"] = ( nasc_biology["nasc"] @@ -101,7 +101,7 @@ def acoustic_pipeline(acoustic_dict: dict, if weight_spatial_averages is not None: # Merge average weights with number density estimates nasc_biology = nasc_biology.merge(weight_spatial_averages, - on=spatial_column + contrast_columns) + on=spatial_column) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -109,7 +109,8 @@ def acoustic_pipeline(acoustic_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + if (all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) + and not nasc_biology.empty): sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", columns=["number_density", "biomass_density"], unique_columns=["id"]) From 4bce56044815049fc6e51b2a6b7e943ab1dd22ed Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 12:05:05 -0700 Subject: [PATCH 69/81] f-string fix for sql_methods --- echopop/live/sql_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index fc28d2d1..9b6354e6 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -745,10 +745,10 @@ def query_dataset(db_file: str, # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) # ---- Conditional string formatting helper function - def format_value(value): - if isinstance(value, str): - return f"'{value.replace("'", "''")}'" - return str(value) + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + return str(x) # ---- Create conditional string conditional_str = " | ".join( [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) From 50937f123806ad337959c8a5630356737cc52b40 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 12:24:05 -0700 Subject: [PATCH 70/81] Fixed `ship_id` f-string issue --- echopop/live/live_acoustics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index f59b3d13..2a9347a0 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -265,7 +265,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{df.loc[index, "ship_id"]}-{str(index)}-{df.loc[index, 'source']}" + key_values = [f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values From f324647f5e8ce8307e2f498875bb2113453f9c2f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:13:16 -0700 Subject: [PATCH 71/81] Fixes to odd SQL table column shuffling --- echopop/live/live_biology.py | 5 +- echopop/live/live_core.py | 192 +++++++++++++++++++++++++++++++++++ echopop/live/live_survey.py | 3 +- echopop/live/sql_methods.py | 13 ++- 4 files changed, 207 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 99264e0f..5e70d92b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -195,9 +195,10 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, insertion_df = sigma_bs_df.copy() # ---- Create SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, - primary_keys=["id"]) + primary_keys=key_list+["id"]) # ---- Populate table - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df, + id_columns=key_list+["id"]) else: # ---- Get previous values in the table table_df = SQL(biology_db, "select", table_name="sigma_bs_mean_df") diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 256b9f27..388a8240 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -32,6 +32,198 @@ }, } +# Required data configuration YAML structure +LIVE_CONFIG_INIT_MODEL = { + "required_keys": ["acoustics", "biology", "geospatial"], + "optional_keys": [], + "keys": { + "acoustics": { + "required_keys": ["transmit", "TS_length_regression_parameters"], + "optional_keys": [], + "keys": { + "transmit": { + "required_keys": ["frequency", "units"], + "optional_keys": [], + "keys": { + "frequency": float, + "units": ["Hz", "kHz"], + }, + }, + "TS_length_regression_parameters": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": { + "required_keys": ["number_code", "TS_L_slope", "TS_L_intercept", + "length_units"], + "optional_keys": ["character_code"], + "keys": { + "number_code": int, + "characeter_code": str, + "TS_L_slope": float, + "TS_L_intercept": float, + "length_units": ["mm", "cm", "m"], + }, + }, + }, + }, + }, + }, + "biology": { + "required_keys": ["length_distribution", "catch"], + "optional_keys": ["stations"], + "keys": { + "length_distribution": { + "required_keys": ["bins"], + "optional_keys": [], + "keys": { + "bins": [float, int], + }, + }, + "stations": { + "required_keys": ["separate_stations", "station_id"], + "optional_keys": [], + "keys": { + "separate_stations": bool, + "station_id": [str], + }, + }, + "catch": { + "required_keys": ["partition"], + "optional_keys": [], + "keys": { + "partition": str, + }, + }, + }, + }, + "geospatial": { + "required_keys": ["projection", "link_biology_acoustics"], + "optional_keys": ["inpfc", "griddify"], + 
"keys": { + "inpfc": { + "required_keys": ["latitude_max", "stratum_names"], + "optional_keys": [], + "keys": { + "latitude_max": [float], + "stratum_names": [int, str], + }, + }, + "griddify": { + "required_keys": ["bounds", "grid_resolution"], + "optional_keys": [], + "keys": { + "bounds": { + "required_keys": [("latitude", "longitude"), ("x", "y")], + "optional_keys": [], + "keys": { + "latitude": [float], + "longitude": [float], + "x": [float], + "y": [float] + }, + }, + "grid_resolution": { + "required_keys":[("latitude_distance", "longitude_distance"), + ("x_distance", "y_distance")], + "optional_keys": [], + "keys": { + "longitude_distance": float, + "latitude_distance": float, + "x_distance": float, + "y_distnace": float, + } + } + }, + }, + "link_biology_acoustics": ["closest_haul", "global", "INPFC", "weighted_haul"], + "projection": str, + }, + }, + }, +} + +# Required data configuration YAML structure +LIVE_CONFIG_DATA_MODEL = { + "required_keys": ["ship_id", "survey_year", "database_directory", "input_directories"], + "optional_keys": ["species", "data_root_dir"], + "keys": { + "data_root_dir": str, + "database_directory": str, + "input_directories": { + "required_keys": ["acoustics", "biology"], + "optional_keys": ["coastline", "grid"], + "keys": { + "acoustics": { + "required_keys": ["database_name", "directory", "extension"], + "optional_keys": [], + "keys": { + "directory": str, + "database_name": str, + "extension": ["zarr"], + }, + }, + "biology": { + "required_keys": ["database_name", "directory", "extension", "file_index", + "file_ids", "file_name_formats"], + "optional_keys": [], + "keys": { + "directory": str, + "database_name": str, + "extension": ["csv"], + "file_name_formats": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": str, + }, + }, + "file_ids": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": str, + }, + }, + "file_index": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": [str], + }, + }, + }, + }, + "coastline": { + "required_keys": ["directory", "coastline_name"], + "optional_keys": [], + "keys": { + "directory": str, + "coastline_name": str, + }, + }, + "grid": { + "required_keys": ["database_name"], + "optional_keys": [], + "keys": { + "database_name": str, + }, + }, + }, + }, + "ship_id": [str, int], + "species": { + "required_keys": [], + "optional_keys": ["text_code", "number_code"], + "keys": { + "text_code": str, + "number_code": int, + }, + }, + "survey_year": int, + }, +} + # TODO: Update structure with additional information (as needed) # TODO: Documentation LIVE_INPUT_FILE_CONFIG_MAP = { diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index a8a2a70e..51cc4ba8 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -60,8 +60,7 @@ def __init__( # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = eldl.live_configuration(Path(live_init_config_path), - Path(live_file_config_path)) + self.config = eldl.live_configuration(live_init_config_path, live_file_config_path) # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 9b6354e6..0e5f6a97 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -116,16 +116,23 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, 
data conflict_columns (list): List of column names to use for conflict resolution. """ + # Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + table_columns = [col['name'] for col in inspector.get_columns(table_name)] + # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' if "*" in columns: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - columns = [col['name'] for col in inspector.get_columns(table_name)] + columns = table_columns # ---- If not a List elif not isinstance(columns, list): columns = [columns] + # ---- Match column indexing with original table + columns = [col for col in table_columns if col in columns] # ---- Prepare the columns as a string of column names column_names = ", ".join(columns) @@ -136,6 +143,8 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) + # ---- Match column indexing with original table + dataframe = dataframe[columns] # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] @@ -157,7 +166,7 @@ def format_value(x): # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) - data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) + data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" From f0f8001cb2f2a8a5765d31c4c56fe67c2fd88698 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:37:55 -0700 Subject: [PATCH 72/81] Minor improvements to visualizer code --- echopop/live/live_visualizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 69c33dd5..4c59d975 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -7,6 +7,7 @@ import geopandas as gpd from typing import Union, Optional from pathlib import Path +import matplotlib.gridspec as gridspec def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], projection: str, @@ -317,6 +318,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): fig, axes = plt.subplots(1, 3, figsize=(10, 10)) elif len(intact_variables) == 2: fig, axes = plt.subplots(1, 1, figsize=(10, 10)) + plt.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.0, hspace=0.0) # Iterate through and plot all subplots for ax, var in zip(axes.flat, intact_variables): From 5fcc0c9ceeefdfefd77f1e387579cb90af44ffad Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:39:49 -0700 Subject: [PATCH 73/81] New configuration file validator --- echopop/live/live_data_loading.py | 220 +++++++++++++++++++++--------- 1 file changed, 154 insertions(+), 66 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 06d1a6fc..80dee085 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -8,11 +8,14 @@ from datetime import datetime import xarray as xr import os +import copy from .live_core import( LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, - SPATIAL_CONFIG_MAP + SPATIAL_CONFIG_MAP, + LIVE_CONFIG_INIT_MODEL, + LIVE_CONFIG_DATA_MODEL ) from 
.live_spatial_methods import create_inpfc_strata @@ -42,8 +45,11 @@ def live_configuration(live_init_config_path: Union[str, Path], # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # -------- Validate + init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL) # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary @@ -456,71 +462,6 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) -# def load_biology_data(file_configuration: dict): - -# # Get the acoustic file settings and root directory -# # ---- File settings -# file_settings = file_configuration["input_directories"]["biology"] -# # ---- Root directory -# root_directory = file_configuration["data_root_dir"] - -# # Get and validate the acoustic data directory and files -# biology_files = validate_data_directory(root_directory, file_settings) - -# # Query `biology.db` to process only new files (or create the db file in the first place) -# # SQL(biology_db, "drop", table_name="files_read") -# new_biology_files, file_configuration["database"]["biology"] = ( -# query_processed_files(root_directory, file_settings, biology_files) -# ) - -# # Get the file-specific settings, datatypes, columns, etc. -# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# # ---- Extract the expected file name ID's -# biology_file_ids = file_settings["file_name_formats"] -# # ---- Extract all of the file ids -# biology_config_ids = list(biology_file_ids.keys()) -# # ---- Initialize the dictionary that will define this key in the `input` attribute -# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} -# # ---- Create filepath object -# directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - -# # Add SQL file to dict -# file_configuration["database"]["biology"] = ( -# Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] -# ) - -# # Iterate through the different biology datasets and read them in -# for dataset in list(biology_file_ids.keys()): -# # ---- Get dataset-specific file lists -# dataset_files = filter_filenames(directory_path, -# file_settings["file_name_formats"][dataset], -# new_biology_files, -# file_settings["extension"]) -# # ---- If there are dataset files available -# if dataset_files: -# # ---- Read in validated biology data -# dataframe_list = [read_biology_csv(Path(file), -# file_settings["file_name_formats"][dataset], -# biology_config_map[dataset]) -# for file in dataset_files] -# # ---- Concatenate the dataset -# dataframe_combined = pd.concat(dataframe_list, ignore_index=True) -# # ---- Lower-case sex -# if "sex" in dataframe_combined.columns: -# dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() -# # ---- Lower-case trawl partition type -# if "trawl_partition" in dataframe_combined.columns: -# dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() -# # ---- Reformat datetime column -# if "datetime" in dataframe_combined.columns: -# 
dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) -# # ---- Add to the data dictionary -# biology_output[f"{dataset}_df"] = dataframe_combined - -# # Pre-process and return the results -# return preprocess_biology_data(biology_output, file_configuration) - def validate_hauls_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -705,3 +646,150 @@ def validate_spatial_config(spatial_config: dict): validate_inpfc_config(spatial_config, link_method) elif link_method != "global": validate_hauls_config(spatial_config, link_method) + +def validate_live_config(config, reference_model): + """Validate configuration inputs""" + + # Recursive function for validating entire nested dictionary + def validate_keys(config, model, path=""): + + # Get the required/optional/actual keys + # ---- Keys that are required by the software + required_keys = model.get("required_keys", []) + # ---- Keys that are optionally incorporated into the software + optional_keys = model.get("optional_keys", []) + # ---- Navigate the nested branches + keys = model.get("keys", {}) + + # General helper functions + # ---- + def get_keys_from_tuples(tuples): + """Parse key names from tuples""" + return {key for group in tuples if isinstance(group, tuple) for key in group} + # ---- + def find_missing_keys(required_keys, keys_to_check): + """Find any missing keys""" + all_required_keys = get_keys_from_tuples(required_keys) + valid_keys_in_tuples = set() + for group in required_keys: + if isinstance(group, tuple): + if any(key in keys_to_check for key in group): + valid_keys_in_tuples.update(group) + missing_keys = [key for key in valid_keys_in_tuples if key not in keys_to_check] + unexpected_keys = [key for key in keys_to_check if key not in all_required_keys] + return missing_keys, unexpected_keys + # ---- + def check_for_missing_keys(required_keys, config_keys, path): + """Check whether any required keys are missing""" + missing_required = [] + for key in required_keys: + if isinstance(key, tuple): + missing_keys, unexpected_keys_for_keys = find_missing_keys(required_keys, + config_keys) + if missing_keys: + raise ValueError( + f"Missing required key(s): {', '.join(missing_keys)} at {path}" + ) + return unexpected_keys_for_keys + elif key not in config_keys and key != "*": + missing_required.append(key) + if missing_required: + raise ValueError( + f"Missing required key(s): {', '.join(missing_required)} at {path}" + ) + return [] + # ---- + def check_for_unexpected_keys(config_keys, required_keys): + """Check for unexpected keys""" + unexpected_keys = [] + for key in config_keys: + if (key not in required_keys + and key not in optional_keys + and "*" not in required_keys): + if not any(key in group for group in required_keys if isinstance(group, tuple)): + unexpected_keys.append(key) + return unexpected_keys + + # Top-level validation + if path == "": + missing_primary_keys = [key for key in required_keys + if key != "*" and key not in config] + if missing_primary_keys: + raise ValueError(f"Missing primary key(s): {', '.join(missing_primary_keys)}") + unexpected_primary_keys = [key for key in config + if key not in required_keys + and key not in optional_keys + and "*" not in required_keys] + # ---- Raise error + if unexpected_primary_keys: + raise ValueError( + f"Unexpected primary key(s) found: {', '.join(unexpected_primary_keys)}" + ) + # Nested validation + else: + config_keys = config.keys() + unexpected_keys = check_for_missing_keys(required_keys, 
config_keys, path) + unexpected_keys.extend(check_for_unexpected_keys(config_keys, required_keys)) + # ---- Raise error + if unexpected_keys: + raise ValueError(f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path}") + + # Recursively validate nested dictionaries and lists + for key, sub_model in keys.items(): + if key == "*" and isinstance(sub_model, dict): + for sub_key in config: + validate_keys(config[sub_key], + sub_model, path=f"{path}.{sub_key}" if path else sub_key) + elif key == "*" and isinstance(sub_model, list): + for sub_key in config: + validate_list(config[sub_key], sub_model, key, path) + elif key == "*": + for sub_key in config: + validate_type(config[sub_key], sub_model, key, path) + elif key in config: + if isinstance(sub_model, dict): + validate_keys(config[key], sub_model, path=f"{path}.{key}" if path else key) + elif isinstance(sub_model, list): + validate_list(config[key], sub_model, key, path) + else: + validate_type(config[key], sub_model, key, path) + + # Additional helper functions + # ---- + def validate_list(config_value, allowed_types, key, path): + """Validate configuration with model that is formatted as a list""" + if all(isinstance(item, (str, int, float)) for item in allowed_types): + if config_value not in allowed_types: + raise ValueError( + f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + ) + elif not isinstance(config_value, list): + if type(config_value) not in allowed_types: + raise ValueError( + f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + ) + else: + if isinstance(config_value, list): + for item in config_value: + if not any(isinstance(item, t) for t in allowed_types): + raise ValueError( + f"Invalid type for items in list '{key}' at {path}. Expected one of: " + f"{allowed_types}" + ) + else: + raise ValueError( + f"Invalid type for key '{key}' at {path}. Expected a list of: {allowed_types}" + ) + # ---- + def validate_type(config_value, expected_type, key, path): + """Validate configuration with model that is at the furthest point along a branch""" + if not isinstance(config_value, expected_type): + raise ValueError( + f"Invalid type for key '{key}' at {path}. 
Expected type: {expected_type}" + ) + + # Validate all branches within the configuration dictionary + validate_keys(config, reference_model) + + # Return + return config From 7db976454f930b71e8471cb3f092d169d698157a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 18:25:27 -0700 Subject: [PATCH 74/81] Data reading validators --- config_files/live_survey_year_2019_config.yml | 4 +- echopop/live/live_data_loading.py | 176 ++++++++---- echopop/test_workflow.py | 260 +++++++++++++++++- 3 files changed, 382 insertions(+), 58 deletions(-) diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index e52db83c..fe8bb8b7 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -7,7 +7,8 @@ ############################################################################## # Parameters -survey_year: 2019 # survey year being considered +ship_id: R/V Shimada +survey_year: 2024 # survey year being considered species: text_code: pacific_hake # target species for the survey year -- species name number_code: 22500 # target species for the survey year -- numeric code @@ -26,6 +27,7 @@ input_directories: extension: zarr biology: directory: biology/ + # directory: s3://sh2407-upload/data/Echopop-biology/ database_name: biology.db extension: csv file_name_formats: diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 80dee085..f096ebed 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -9,6 +9,8 @@ import xarray as xr import os import copy +import boto3 +from botocore.exceptions import ClientError from .live_core import( LIVE_FILE_FORMAT_MAP, @@ -218,66 +220,65 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - # Return a Tuple return zarr_data_df_filtered, data_units -# TODO: Documentation -def validate_data_directory(file_configuration: dict, dataset: str, - input_filenames: Optional[list] = None) -> List[Path]: - - # Get the dataset file settings - file_settings = file_configuration["input_directories"][dataset] +def construct_directorypath(file_configuration: dict, file_settings: dict): + """Construct the root directory path.""" - # Get the acoustic file settings and root directory - # ---- Root directory - if "data_root_dir" in file_configuration.keys(): - # root_directory = Path(file_configuration["data_root_dir"]) + # Get the general root_directory, if present + if "data_root_dir" in file_configuration: root_directory = file_configuration["data_root_dir"] - else: - # root_directory = Path() + else: root_directory = "" - # ---- File folder - # data_directory = Path(file_settings["directory"]) + + # Get the local directory (or this may be the root directory depending on the config) data_directory = file_settings["directory"] - # ---- Createa directory path - # directory_path = root_directory / data_directory + + # Return the directory path if root_directory != "": - directory_path = "/".join([root_directory, data_directory]) + return "/".join([root_directory, data_directory]) else: - directory_path = data_directory + return data_directory - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - # if not directory_path.exists(): - # raise FileNotFoundError( - # f"The acoustic data directory [{directory_path}] does not exist." 
- # ) +def is_s3_path(path): + """Check if a path is an S3 path.""" + return path.startswith("s3://") - # Validate that files even exist - # ---- List available *.zarr files - # data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) - # ---- Error evaluation (if applicable) - # if not data_files: - # raise FileNotFoundError( - # f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - # ) - - # Check and format specific input filenames - if isinstance(input_filenames, list): - # data_files = [directory_path / filename for filename in input_filenames] - data_files = ["/".join([directory_path, filename]) for filename in input_filenames] - # ---- Raise Error - elif input_filenames is not None: +# TODO: Documentation +def validate_data_directory(file_configuration: dict, dataset: str, + input_filenames: Optional[list] = None) -> List[Path]: + + # Get the dataset file settings + file_settings = file_configuration["input_directories"][dataset] + + # Get the data file settings and directorypath + directory_path = construct_directorypath(file_configuration, file_settings) + + # Validate `input_filenames` input + if input_filenames is not None and not isinstance(input_filenames, list): raise TypeError( "Data loading argument `input_filenames` must be a list." ) - else: - data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) - - # Database root directory - database_root_directory = file_configuration["database_directory"] - - # Initialize the database file - initialize_database(database_root_directory, file_settings) + + # Format data filenames + if input_filenames is not None: + data_files = ["/".join([directory_path, filename]) for filename in input_filenames] - # Clean the file names + # Validate directories and format filepath names + # ---- S3 bucket + if is_s3_path(directory_path): + # ---- Validate + validate_s3_path(directory_path, file_configuration["storage_options"]) + # ---- Format data files + if input_filenames is None: + data_files = [] + # ---- Local + else: + # ---- Validate + validate_local_path(directory_path, file_settings) + # ---- Format data files + if input_filenames is None: + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) + + # Clean the filenames data_files = [ re.sub(r'//', r'\\', str(filename)).replace('/', '\\') if not str(filename).startswith('s3://') @@ -285,6 +286,12 @@ def validate_data_directory(file_configuration: dict, dataset: str, for filename in data_files ] + # Database root directory + database_root_directory = file_configuration["database_directory"] + + # Initialize the database file + initialize_database(database_root_directory, file_settings) + # Drop incomplete datasets if dataset == "biology": data_files = validate_complete_biology_dataset(data_files, @@ -299,6 +306,79 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Return the valid filenames/paths return valid_files +def validate_s3_path(s3_path: str, cloud_credentials: dict): + """Check if (parts of) S3 path exists.""" + + # Redundant validation that S3 object validation is appropriate + if not is_s3_path(s3_path): + raise ValueError("The path is not an S3 path.") + + # Validate credentials + if not all([True if param in cloud_credentials.keys() else False + for param in ["key", "secret"]]): + # ---- Find missing credentials + missing_creds = set(["key", "secret"]) - set(cloud_credentials) + # ---- Format into string + missing_creds_str = ", 
".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) + # ---- Raise Error + raise PermissionError( + f"Required S3 credentials missing: {missing_creds_str}." + ) + + # Remove the s3:// prefix + s3_path_reduced = s3_path[len("s3://"):] + + # Split into bucket and key + parts = s3_path_reduced.split("/", 1) + if len(parts) < 2: + raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + + # Get bucket name and directory keys + bucket_name, directory = parts + + # Initialize the S3 client + s3_client = boto3.client("s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"]) + + # Check if the bucket exists + try: + s3_client.head_bucket(Bucket=bucket_name) + except ClientError as e: + raise FileNotFoundError( + f"S3 bucket '{bucket_name}' does not exist or you do not have access." + ) + + # Check if the S3 directory exists + try: + # ---- Ping a response from the bucket + response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) + # ---- Check for `Contents` + if "Contents" not in response: + raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") + except ClientError as e: + # --- Raise Error and propagate it upwards + raise e + +def validate_local_path(directory_path: str, file_settings: dict): + + # Validate filepath + # ---- Error evaluation (if applicable) + if not Path(directory_path).exists(): + raise FileNotFoundError( + f"The data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available files of target extension + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" 
+ ) + + def validate_complete_biology_dataset(data_files: List[str], directory_path: str, file_configuration: dict): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index a22fa10b..8e15088c 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -2,6 +2,24 @@ from echopop.live.sql_methods import SQL import echopop.live.live_visualizer as elv from pathlib import Path +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +) +import boto3 +from botocore.exceptions import NoCredentialsError, ClientError +import pandas as pd +import numpy as np +from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +from echopop.live.live_biology import compute_sigma_bs +from echopop.acoustics import ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce +from echopop.live.live_data_loading import filter_filenames, read_biology_csv + #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration @@ -9,10 +27,174 @@ # NOTE: File configuration live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # NOTE: Create object -realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) # NOTE: String-representation via `LiveSurvey.__repr__`: # NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey +self = realtime_survey +file_configuration = self.config + +input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] +realtime_survey.config["input_directories"]["biology"]["directory"] = "s3://sh2407-upload/data/Echopop-biology" + +survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", table_name="survey_data_df") + + +del realtime_survey.config["data_root_dir"] +self = realtime_survey + +# realtime_survey.config["storage_options"] = aws_credentials +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +realtime_survey.load_biology_data(input_filenames=input_filenames) +realtime_survey.input["biology"] +def is_s3_path(path): + """Check if a path is an S3 path.""" + return path.startswith("s3://") + +dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] +s3_path = dataset_directory +is_s3_path(dataset_directory) + +cloud_credentials = aws_credentials +cloud_credentials = {} +def validate_s3_path(s3_path: str, cloud_credentials: dict): + """Check if (parts of) S3 path exists.""" + + # Redundant validation that S3 object validation is appropriate + if not is_s3_path(s3_path): + raise ValueError("The path is not an S3 path.") + + # Validate credentials + if not all([True if param in cloud_credentials.keys() else False + for param in ["key", "secret"]]): + # ---- Find missing credentials + missing_creds = set(["key", "secret"]) - set(cloud_credentials) + 
# ---- Format into string + missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) + # ---- Raise Error + raise PermissionError( + f"Required S3 credentials missing: {missing_creds_str}." + ) + + # Remove the s3:// prefix + s3_path_reduced = s3_path[len("s3://"):] + + # Split into bucket and key + parts = s3_path_reduced.split("/", 1) + if len(parts) < 2: + raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + + # Get bucket name and directory keys + bucket_name, directory = parts + + # Initialize the S3 client + s3_client = boto3.client("s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"]) + + # Check if the bucket exists + try: + s3_client.head_bucket(Bucket=bucket_name) + except ClientError as e: + raise FileNotFoundError( + f"S3 bucket '{bucket_name}' does not exist or you do not have access." + ) + + # Check if the S3 directory exists + try: + # ---- Ping a response from the bucket + response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) + # ---- Check for `Contents` + if "Contents" not in response: + raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") + except ClientError as e: + # --- Raise Error and propagate it upwards + raise e + +validate_s3_path(s3_path, cloud_credentials) + +import pandas as pd + +self = realtime_survey +biology_files = self.meta["provenance"]["biology_files_read"] +file_configuration = self.config +dataset = "biology" + +# Get the dataset file settings +file_settings = file_configuration["input_directories"][dataset] + +def construct_directorypath(file_configuration: dict, file_settings: dict): + """Construct the root directory path.""" + + # Get the general root_directory, if present + if "data_root_dir" in file_configuration: + root_directory = file_configuration["data_root_dir"] + else: + root_directory = "" + + # Get the local directory (or this may be the root directory depending on the config) + data_directory = file_settings["directory"] + + # Return the directory path + if root_directory != "": + return "/".join([root_directory, data_directory]) + else: + return data_directory + +directory_path = construct_directorypath(file_configuration, file_settings) + +def validate_local_path(directory_path: str): + + # Validate filepath + # ---- Error evaluation (if applicable) + if not Path(directory_path).exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available files of target extension + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" + ) + + + + +# Get the biology data file settings +file_settings = file_configuration["input_directories"]["biology"] + +# Get the file-specific settings, datatypes, columns, etc. 
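+# NOTE: Hedged sketch (illustrative only) of how the path helpers defined above in this script
+# are expected to fit together; `cloud_credentials` is assumed to carry "key"/"secret" entries
+# whenever the configured biology directory points at S3:
+# biology_directory = construct_directorypath(file_configuration, file_settings)
+# if is_s3_path(biology_directory):
+#     validate_s3_path(biology_directory, cloud_credentials)
+# else:
+#     validate_local_path(biology_directory)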
+# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# ---- Extract the expected file name ID's +biology_file_ids = file_settings["file_name_formats"] +# ---- Extract all of the file ids +biology_config_ids = list(biology_file_ids.keys()) +# ---- Initialize the dictionary that will define this key in the `input` attribute +biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + +# Initialize a session with AWS credentials +s3_client = boto3.client( + 's3', + aws_access_key_id=aws_credentials["key"], + aws_secret_access_key=aws_credentials["secret"] +) +response = s3_client.list_buckets() +buckets = response.get('Buckets', []) +for bucket in buckets: + print(f"Bucket Name: {bucket['Name']}") +s3_client.head_bucket(Bucket="sh2407-upload") +realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) +realtime_survey.config["ship_id"] +grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") +grid_data[grid_data.abundance > 0] +bucket = boto3.client("s3", region_name=None) +bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] + "/") +bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], Prefix=path, MaxKeys=1) #################################################################################################### # TEST: TRIGGER --> NEW ACOUSTIC DATA # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) @@ -25,11 +207,13 @@ realtime_survey.estimate_population(working_dataset="acoustic") # NOTE: String-representation via `LiveSurvey.__repr__`: # NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey +realtime_survey.input["acoustics"] #################################################################################################### # TEST: TRIGGER --> NEW BIOLOGY DATA # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_biology_data() +len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) +realtime_survey.meta["provenance"]["biology_files_checkpoint3"] # NOTE: Process new biological data # NOTE: This will update linked database tables realtime_survey.process_biology_data() @@ -50,15 +234,18 @@ # ---- ACOUSTIC SQL(db_file=realtime_survey.config["database"]["acoustics"], command="select", table_name="files_processed") +dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", table_name="files_processed") # ---- BIOLOGICAL -SQL(db_file=realtime_survey.config["database"]["biology"], - command="select", table_name="files_processed") +SQL(db_file=realtime_survey.config["database"]["biology"],command="select", table_name="files_processed") +dat.loc[0:, "filepath"][105] #################################################################################################### # TEST: `LiveSurvey` --[(key) SQL tables]--> Users # !!! 
The SQL functions will fail if the tables have not yet been created/initialized # ---- ACOUSTICS # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df").latitude.max() +realtime_survey.input["spatial"]["strata"] # NOTE: Along-track acoustically-derived number/biomass densities and NASC SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") # ---- BIOLOGICAL @@ -76,7 +263,59 @@ import panel as pn # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -grid_db = Path(realtime_survey.config["database"]["grid"]) +# grid_db = Path(realtime_survey.config["database"]["grid"]) +grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") +dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +dat +dat1 = SQL(grid_db, "select", table_name="grid_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") + +sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" +# Create the engine +engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") +# Create the SQL database connection and send the script +with engine.connect() as connection: + table = connection.execute(text(sql_cmd)) + +data = table.fetchall() +dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] +dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] +dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) +SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", dataframe=dd) +SQL("C:/Users/Brandyn/Downloads/biology.db", "map") +SQL(biology_db, "drop", table_name="sigma_bs_mean_df") +SQL(biology_db, "select", table_name="sigma_bs_mean_df") +dd.loc[:, "haul_num"] = pd.Series([101, 103]) +dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] +SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) +SQL(biology_db, "select", table_name="sigma_bs_mean_df") +import numpy as np; import pandas as pd +SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") +sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="sigma_bs_mean_df") +table_df = SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +sigma_bs_df = table_df +# ---- Check the table keys +table_keys = np.unique(table_df["id"]).tolist() +# ---- Get unique values +current_keys = np.unique(sigma_bs_df["id"]).tolist() +# ---- Get INSERTION keys +insertion_keys = list(set(current_keys).difference(set(table_keys))) +# ---- Get UPDATE keys +update_keys = list(set(current_keys).intersection(set(table_keys))) +insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] +insertion_df.loc[0, "species_id"] = 22500 +insertion_df.loc[0, "stratum"] = 5 +insertion_df.loc[0, "haul_num"] = 100 +insertion_df.loc[0, "sigma_bs"] = 1e-10 +insertion_df.loc[0, "sigma_bs_count"] = 100 +insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 +insertion_df.loc[0, "id"] = f"{(1,1,1)}" +SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", + 
dataframe=insertion_df) +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +dat1[dat1.abundance > 0] +dat[dat.number_density > 0] coast_db = grid_db biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] @@ -91,10 +330,13 @@ def plt_to_pn(fig): panel.show() # OR panel.servable() if you want to serve it in a Panel server # ---- PLOT GRID fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig.show() plt_to_pn(fig) # ---- PLOT TRACK -fig1 = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) -plt_to_pn(fig) +from echopop.live.live_visualizer import plot_livesurvey_track +fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +fig1.show() +plt_to_pn(fig1) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", table_name="length_weight_df") @@ -105,7 +347,7 @@ def plt_to_pn(fig): length_table = SQL(biology_db, "select", table_name="length_df") fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) -plt_to_pn(fig) +plt_to_pn(fig2) ### MULTIPANEL panel0 = pn.panel(fig, name='Gridded population estimates') panel1 = pn.panel(fig1, name='Alongtrack population estimates') From cef3036ffecd4a05fa666023df0c8d05b0cacc25 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 26 Aug 2024 10:29:45 -0700 Subject: [PATCH 75/81] Clarified config validation error messages --- echopop/live/live_data_loading.py | 48 ++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f096ebed..3018604e 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -48,10 +48,12 @@ def live_configuration(live_init_config_path: Union[str, Path], # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) # -------- Validate - init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL) + init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, + live_init_config_path) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL) + file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, + live_file_config_path) # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary @@ -727,8 +729,12 @@ def validate_spatial_config(spatial_config: dict): elif link_method != "global": validate_hauls_config(spatial_config, link_method) -def validate_live_config(config, reference_model): +def validate_live_config(config: dict, reference_model: dict, filename: Union[str, Path]): """Validate configuration inputs""" + + # Convert to string if Path + if isinstance(filename, Path): + filename = str(filename) # Recursive function for validating entire nested dictionary def validate_keys(config, model, path=""): @@ -768,14 +774,17 @@ def check_for_missing_keys(required_keys, config_keys, path): config_keys) if missing_keys: raise ValueError( - f"Missing required key(s): {', '.join(missing_keys)} at {path}" + f"Missing required configuration key(s): " + f"{', '.join(missing_keys)} at {path} in 
configuration file " + f"'{filename}'." ) return unexpected_keys_for_keys elif key not in config_keys and key != "*": missing_required.append(key) if missing_required: raise ValueError( - f"Missing required key(s): {', '.join(missing_required)} at {path}" + f"Missing required configuration key(s): {', '.join(missing_required)} at " + f"{path} in configuration file '{filename}'." ) return [] # ---- @@ -795,7 +804,10 @@ def check_for_unexpected_keys(config_keys, required_keys): missing_primary_keys = [key for key in required_keys if key != "*" and key not in config] if missing_primary_keys: - raise ValueError(f"Missing primary key(s): {', '.join(missing_primary_keys)}") + raise ValueError( + f"Missing primary configuration key(s): {', '.join(missing_primary_keys)} in " + f"configuration file '{filename}'." + ) unexpected_primary_keys = [key for key in config if key not in required_keys and key not in optional_keys @@ -803,7 +815,8 @@ def check_for_unexpected_keys(config_keys, required_keys): # ---- Raise error if unexpected_primary_keys: raise ValueError( - f"Unexpected primary key(s) found: {', '.join(unexpected_primary_keys)}" + f"Unexpected primary key(s) found in configuration file '{filename}': " + f"{', '.join(unexpected_primary_keys)}" ) # Nested validation else: @@ -812,7 +825,10 @@ def check_for_unexpected_keys(config_keys, required_keys): unexpected_keys.extend(check_for_unexpected_keys(config_keys, required_keys)) # ---- Raise error if unexpected_keys: - raise ValueError(f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path}") + raise ValueError( + f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path} in " + f"configuration file '{filename}'." + ) # Recursively validate nested dictionaries and lists for key, sub_model in keys.items(): @@ -841,31 +857,35 @@ def validate_list(config_value, allowed_types, key, path): if all(isinstance(item, (str, int, float)) for item in allowed_types): if config_value not in allowed_types: raise ValueError( - f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + f"Invalid value for key '{key}' at {path} in {filename}. Expected one of: " + f"{allowed_types}" ) elif not isinstance(config_value, list): if type(config_value) not in allowed_types: raise ValueError( - f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + f"Invalid value for key '{key}' at {path} in {filename}. Expected one of: " + f"{allowed_types}" ) else: if isinstance(config_value, list): for item in config_value: if not any(isinstance(item, t) for t in allowed_types): raise ValueError( - f"Invalid type for items in list '{key}' at {path}. Expected one of: " - f"{allowed_types}" + f"Invalid type for items in list '{key}' at {path} in {filename}. " + f"Expected one of: {allowed_types}" ) else: raise ValueError( - f"Invalid type for key '{key}' at {path}. Expected a list of: {allowed_types}" + f"Invalid type for key '{key}' at {path} in {filename}. Expected a list of: " + f"{allowed_types}" ) # ---- def validate_type(config_value, expected_type, key, path): """Validate configuration with model that is at the furthest point along a branch""" if not isinstance(config_value, expected_type): raise ValueError( - f"Invalid type for key '{key}' at {path}. Expected type: {expected_type}" + f"Invalid type for key '{key}' at {path} in {filename}. 
Expected type: " + f"{expected_type}" ) # Validate all branches within the configuration dictionary From 0aa88ac61619087b0703bce480b0bf5417c54a22 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 26 Aug 2024 10:55:14 -0700 Subject: [PATCH 76/81] Pre-commit formatting changes --- config_files/live_initialization_config.yml | 22 +- config_files/live_survey_year_2019_config.yml | 8 +- echopop/live/__init__.py | 2 +- echopop/live/live_acoustics.py | 303 +- echopop/live/live_biology.py | 815 +-- echopop/live/live_core.py | 101 +- echopop/live/live_data_loading.py | 395 +- echopop/live/live_data_processing.py | 231 +- echopop/live/live_spatial_methods.py | 316 +- echopop/live/live_survey.py | 315 +- echopop/live/live_visualizer.py | 438 +- echopop/live/sql_methods.py | 487 +- echopop/mesh_generation.py | 4520 +++++++++-------- echopop/test_workflow.py | 766 +-- echopop/utils/operations.py | 12 +- echopop/zarr_read_ingest_test.py | 3789 +++++++------- 16 files changed, 6532 insertions(+), 5988 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index ae265343..4e386b3e 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -8,16 +8,16 @@ ######################## biology: # Length-binning - # NOTE: start : end : number + # NOTE: start : end : number length_distribution: bins: [2, 80, 40] # Station separation # NOTE: if `separate_stations` is True, `['list']` is required for `station_id` - stations: + stations: separate_stations: True station_id: ["length", "specimen"] - # Trawl identifier - catch: + # Trawl identifier + catch: partition: codend ##################################################################################################################### @@ -25,10 +25,10 @@ ######################## geospatial: inpfc: # INPFC northern latitude limits and labels - latitude_max: [36.0, 40.5, 43.0, + latitude_max: [36.0, 40.5, 43.0, 45.7667, 48.50, 55.0] - stratum_names: [1, 2, 3, 4, 5, 6] - griddify: + stratum_names: [1, 2, 3, 4, 5, 6] + griddify: # Coordinate bounds bounds: latitude: [32.75, 55.50] @@ -39,20 +39,20 @@ y_distance: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string - # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This # comprises True/False statements that denote the desired association. All values set to "True" will be output. 
# `global` --> NASC associated with sigma_bs calculated from all survey data - # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates link_biology_acoustics: INPFC ##################################################################################################################### # Acoustics settings# - ######################## + ######################## acoustics: # Acoustic transmit frequency (Hz or kHz) - transmit: + transmit: frequency: 38.0 units: kHz # Target strength (TS) - length (L) regression: TS=m*log10(L)+b diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index fe8bb8b7..485ad86e 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -20,7 +20,7 @@ database_directory: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_fil ############################################################################## # Input data directories -input_directories: +input_directories: acoustics: directory: acoustics/ database_name: acoustics.db @@ -29,7 +29,7 @@ input_directories: directory: biology/ # directory: s3://sh2407-upload/data/Echopop-biology/ database_name: biology.db - extension: csv + extension: csv file_name_formats: catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" @@ -40,12 +40,12 @@ input_directories: length: [haul_num, species_id] specimen: [haul_num, species_id] trawl_info: [] - file_ids: + file_ids: catch: catch_perc length: lf specimen: spec trawl_info: operation_info - coastline: + coastline: directory: coastline/ coastline_name: ne_10m_land grid: diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index 325afcbb..1dca9f3e 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -2,4 +2,4 @@ __all__ = ["operations"] -# from _echopop_version import version as __version__ # noqa \ No newline at end of file +# from _echopop_version import version as __version__ # noqa diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 2a9347a0..d2f7e763 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,19 +1,21 @@ -from typing import Union, Optional, List +from typing import Optional, Union + import numpy as np import pandas as pd -from ..acoustics import ts_length_regression, to_linear, to_dB -from .live_spatial_methods import apply_spatial_definitions, apply_griddify_definitions -from .sql_methods import sql_data_exchange, SQL, query_processed_files +from ..acoustics import to_linear, ts_length_regression +from .live_spatial_methods import apply_griddify_definitions, apply_spatial_definitions +from .sql_methods import query_processed_files, sql_data_exchange + # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - +def configure_transmit_frequency( + frequency_values: pd.Series, transmit_settings: dict, current_units: str +): + # Extract transmit frequency units defined in configuration file configuration_units = transmit_settings["units"] - + # Transform the units, if necessary # ---- Hz 
to kHz if current_units == "Hz" and configuration_units == "kHz": @@ -24,11 +26,12 @@ def configure_transmit_frequency(frequency_values: pd.Series, # ---- No change else: return frequency_values - + + # TODO: Documentation -def preprocess_acoustic_data(survey_data: pd.DataFrame, - spatial_dict: dict, - file_configuration: dict) -> pd.DataFrame: +def preprocess_acoustic_data( + survey_data: pd.DataFrame, spatial_dict: dict, file_configuration: dict +) -> pd.DataFrame: # Get acoustic processing settings acoustic_analysis_settings = file_configuration["acoustics"] @@ -37,10 +40,10 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - survey_data.loc[:, "frequency_nominal"] = ( - configure_transmit_frequency(survey_data.loc[:, "frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) + survey_data.loc[:, "frequency_nominal"] = configure_transmit_frequency( + survey_data.loc[:, "frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"], ) # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( @@ -50,15 +53,17 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, ) # Get grid coordinates - prc_nasc_df_filtered = pd.concat([ - prc_nasc_df_filtered, - apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]) - ], axis = 1) - + prc_nasc_df_filtered = pd.concat( + [ + prc_nasc_df_filtered, + apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]), + ], + axis=1, + ) + # Apply spatial settings - prc_nasc_df_filtered = ( - prc_nasc_df_filtered - .assign(stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict)) + prc_nasc_df_filtered = prc_nasc_df_filtered.assign( + stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object @@ -66,31 +71,28 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, prc_nasc_df_filtered.loc[:, "ship_id"] = file_configuration["ship_id"] # ---- Replace NASC `NaN` values with `0.0` prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns=["frequency_nominal"]) + # TODO: Documentation -def average_sigma_bs(length: Union[pd.DataFrame, float, int], - weights: Optional[Union[float, int, str]] = None): +def average_sigma_bs( + length: Union[pd.DataFrame, float, int], weights: Optional[Union[float, int, str]] = None +): # Function approach for dataframe input if isinstance(length, pd.DataFrame): - if "length" not in length.columns: - raise ValueError( - "Column [`length`] missing from dataframe input `length`." - ) + if "length" not in length.columns: + raise ValueError("Column [`length`] missing from dataframe input `length`.") elif "TS_L_slope" not in length.columns: - raise ValueError( - "Column [`TS_L_slope`] missing from dataframe input `length`." - ) + raise ValueError("Column [`TS_L_slope`] missing from dataframe input `length`.") elif "TS_L_slope" not in length.columns: - raise ValueError( - "Column [`TS_L_intercept`] missing from dataframe input `length`." 
- ) - else: + raise ValueError("Column [`TS_L_intercept`] missing from dataframe input `length`.") + else: # ---- Compute the TS (as an array) - target_strength = ts_length_regression(length["length"], length["TS_L_slope"], - length["TS_L_intercept"]) + target_strength = ts_length_regression( + length["length"], length["TS_L_slope"], length["TS_L_intercept"] + ) # ---- Convert to `sigma_bs` sigma_bs_value = to_linear(target_strength) # ---- Weighted or arithmetic avveraging @@ -100,10 +102,11 @@ def average_sigma_bs(length: Union[pd.DataFrame, float, int], raise ValueError( f"Defined `weights` column, {weights}, missing from dataframe input " f"`length`." - ) + ) else: return (sigma_bs_value * length[weights]).sum() / length[weights].sum() + # TODO: Documentation # TODO: Refactor def estimate_echometrics(acoustic_data_df: pd.DataFrame): @@ -113,81 +116,92 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Compute ABC # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852**2) # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() # ---- Change first cell ! - acoustic_df.loc[0, "dz"] = ( - acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] - ) + acoustic_df.loc[0, "dz"] = acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] # Initialize echometrics dictionary echometrics = {} # Compute the metrics center-of-mass if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation_index": np.nan, - "occupied_area": 0.0, - }) + echometrics.update( + { + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation_index": np.nan, + "occupied_area": 0.0, + } + ) else: - - # Create the `echometrics` dictionary - echometrics.update({ - # ---- Number of layers - "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), - # ---- Mean Sv (back-calculated) - "mean_Sv": float( - 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - ), - # ---- Max Sv (back-calculated) - "max_Sv": float( - 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - ), - # ---- (Logarithmic) acoustic abundance - "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), - # ---- Center-of-mass - "center_of_mass": float( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ), - # ---- Evenness - "evenness": float( - (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - ), - # ---- Occupied area - "occupied_area": float( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) + + # Create the `echometrics` dictionary + echometrics.update( + { + # ---- Number of layers + "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), + # ---- Mean Sv (back-calculated) + "mean_Sv": float( + 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + ), + # ---- Max Sv (back-calculated) + "max_Sv": float( + 10 + * np.log10( + acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"] + ) + ), + # ---- (Logarithmic) acoustic abundance + "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), + # ---- Center-of-mass + 
"center_of_mass": float( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ), + # ---- Evenness + "evenness": float( + (acoustic_df["NASC"] ** 2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + ), + # ---- Occupied area + "occupied_area": float( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ), + } + ) # Update variable-dependent metrics - echometrics.update({ - # ---- Dispersion - "dispersion": float( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ), - # ---- Index of aggregation - "aggregation_index": float(1 / echometrics["evenness"]), - }) + echometrics.update( + { + # ---- Dispersion + "dispersion": float( + ( + (acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"] + ).sum() + / (acoustic_df["NASC"]).sum() + ), + # ---- Index of aggregation + "aggregation_index": float(1 / echometrics["evenness"]), + } + ) # Return the dictionary return echometrics + def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # Vertically integrate PRC NASC nasc_dict = {"nasc": data_df["NASC"].sum()} - + # Horizontally concatenate `echometrics`, if `True` if echometrics: # ---- Compute values @@ -202,21 +216,23 @@ def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # return pd.DataFrame([nasc_dict]) -def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): + +def compute_nasc( + acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True +): # Get spatial definitions, if any # spatial_column = file_configuration["spatial_column"] # Get stratum column, if any gridding_column = file_configuration["gridding_column"] - + # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources # if len(np.unique(acoustic_data_df["ping_time"])) > 1: # nasc_data_df = ( # acoustic_data_df - # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, # observed=False) # .apply(integrate_nasc, echometrics, include_groups=False).unstack() # .reset_index() @@ -224,49 +240,74 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # ) # else: nasc_data_df = ( - acoustic_data_df - .groupby(["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, - observed=False) - .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) + acoustic_data_df.groupby( + ["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, + observed=False, + ) + .apply(integrate_nasc, echometrics, include_groups=False) + .droplevel(-1) .reset_index() .sort_values("ping_time") ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation_index": float, "occupied_area": float}) + nasc_data_df = nasc_data_df.astype( + { + "n_layers": int, + "mean_Sv": float, + "max_Sv": float, + "nasc_db": float, + "center_of_mass": float, + "dispersion": float, + "evenness": float, + "aggregation_index": float, + "occupied_area": float, + } ) # ---- Reorder columns nasc_data_df = nasc_data_df[ gridding_column - + ["ship_id", "longitude", "latitude", "ping_time", "source", "nasc", 
"n_layers", - "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", - "evenness", "occupied_area"] + + [ + "ship_id", + "longitude", + "latitude", + "ping_time", + "source", + "nasc", + "n_layers", + "nasc_db", + "mean_Sv", + "max_Sv", + "aggregation_index", + "center_of_mass", + "dispersion", + "evenness", + "occupied_area", + ] ] # Return the output return nasc_data_df + def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict, meta_dict: dict): # Get acoustic database filename acoustic_db = file_configuration["database"]["acoustics"] - - # Create a copy of the dataframe + + # Create a copy of the dataframe df = nasc_data_df.copy() - + # Add population-specific columns (specified in the file configuration) # TODO: Add to `yaml` file for configuration; hard-code for now add_columns = ["number_density", "biomass_density"] # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" - for index in df.index] + key_values = [ + f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" for index in df.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values @@ -274,15 +315,23 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict root_database = file_configuration["database_directory"] # Update the successfully processed files - query_processed_files(root_database, - file_configuration["input_directories"]["acoustics"], - meta_dict["provenance"]["acoustic_files_read"], - processed=True) - + query_processed_files( + root_database, + file_configuration["input_directories"]["acoustics"], + meta_dict["provenance"]["acoustic_files_read"], + processed=True, + ) + # Insert the new data into the database & pull in the combined dataset # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT - _ = sql_data_exchange(acoustic_db, dataframe=df, table_name="survey_data_df", - id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - + _ = sql_data_exchange( + acoustic_db, + dataframe=df, + table_name="survey_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame, + ) + # Return the formatted dataframe - return df \ No newline at end of file + return df diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 5e70d92b..a9cacb4b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,11 +1,19 @@ -import pandas as pd +from functools import reduce + import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary -from .live_spatial_methods import apply_spatial_definitions -from .live_acoustics import average_sigma_bs -from ..acoustics import ts_length_regression, to_dB, to_linear +import pandas as pd + from ..utils.operations import group_interpolator_creator -from functools import reduce +from .live_acoustics import average_sigma_bs +from .live_spatial_methods import apply_spatial_definitions +from .sql_methods import ( + SQL, + get_table_key_names, + sql_data_exchange, + sql_group_update, + sql_update_strata_summary, +) + def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): @@ -20,6 +28,7 @@ def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): # Return output return data_copy + def merge_trawl_info(biology_dict: dict): # 
Get the trawl information dictionary @@ -37,15 +46,15 @@ def merge_trawl_info(biology_dict: dict): # Drop the trawl information del biology_dict["trawl_info_df"] + def prepare_length_distribution(file_configuration: dict): # Get the length distribution parameters distrib_params = file_configuration["biology"]["length_distribution"]["bins"] # Create histogram bins - length_bins = ( - np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, - dtype=float) + length_bins = np.linspace( + **{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float ) # Get the binwidths @@ -56,8 +65,8 @@ def prepare_length_distribution(file_configuration: dict): # Format as a DataFrame and return the output # ---- Add Categorical interval column - length_bins_df = ( - pd.DataFrame({"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)}) + length_bins_df = pd.DataFrame( + {"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)} ) # ---- Add numeric lower boundary length_bins_df["lower"] = length_bins_df["interval"].apply(lambda x: x.left).astype(float) @@ -67,11 +76,12 @@ def prepare_length_distribution(file_configuration: dict): # Return the dataframe that will be incorporated into the biological data attribute return length_bins_df + def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_configuration: dict): - + # Get SQL database file biology_db = file_configuration["database"]["biology"] - + # Get contrasts used for filtering the dataset # ---- Species species_filter = file_configuration["species"]["number_code"] @@ -82,8 +92,9 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # Apply the filter filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() + if isinstance(df, pd.DataFrame) and not df.empty } # ---- Create new data flag file_configuration["length_distribution"] = prepare_length_distribution(file_configuration) @@ -99,7 +110,7 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) # ---- Plug into the dictionary filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary + # ---- Initialize the results dictionary sql_results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} # Update the SQL database @@ -109,17 +120,21 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # ---- Create copy df = df.copy() # ---- Assign values for key values - key_values = [str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) - for index in df.index] + key_values = [ + str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) + for index in df.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) + table_df = sql_data_exchange( + biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + 
output_type=pd.DataFrame, + ) # ---- Drop SQL db identifier if "id" in table_df.columns: table_df.drop(columns="id", inplace=True) @@ -129,18 +144,19 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # Return the output return filtered_biology_output, sql_results_dict -def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, - file_configuration: dict): + +def compute_sigma_bs( + specimen_data: pd.DataFrame, length_data: pd.DataFrame, file_configuration: dict +): # Determine contrast columns # ----- Check for "stratum" column in spatial definitions configuration stratum_column = file_configuration["spatial_column"] # ---- Append to other defined keys contrast_columns = stratum_column + ["haul_num", "species_id", "length"] - + # Meld the biological datasets - length_datasets = specimen_data.meld(length_data, - contrasts=contrast_columns) + length_datasets = specimen_data.meld(length_data, contrasts=contrast_columns) # Get the TS-length model parameterization ts_length_parameters_spp = [ @@ -152,21 +168,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # Extract the target species information target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) # ---- Filter out non-target species - length_datasets = ( - length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] - ) + length_datasets = length_datasets[ + length_datasets["species_id"].isin(target_species["number_code"]) + ] # ---- Merge with `length_datasets` - ts_length_df = length_datasets.merge(target_species, - left_on=["species_id"], right_on=["number_code"]) + ts_length_df = length_datasets.merge( + target_species, left_on=["species_id"], right_on=["number_code"] + ) # Compute the mean sigma_bs for this particular haul # ---- Create primary key list key_list = list(set(contrast_columns) - set(["length"])) # ---- Compute haul-specific means sigma_bs_df = ( - ts_length_df - .groupby(key_list, observed=False) - [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] + ts_length_df.groupby(key_list, observed=False)[ + ["TS_L_slope", "TS_L_intercept", "length", "length_count"] + ] .apply(lambda x: average_sigma_bs(x, weights="length_count")) .to_frame("sigma_bs") ) @@ -174,9 +191,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # For SQL database storage purposes, the sum and count are stored instead # ---- Count sum sigma_bs_df["sigma_bs_count"] = ( - ts_length_df.reset_index() - .groupby(key_list, observed=False)["length_count"] - .sum() + ts_length_df.reset_index().groupby(key_list, observed=False)["length_count"].sum() ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] @@ -194,11 +209,21 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create an insertion dataframe insertion_df = sigma_bs_df.copy() # ---- Create - SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, - primary_keys=key_list+["id"]) + SQL( + biology_db, + "create", + table_name="sigma_bs_mean_df", + dataframe=insertion_df, + primary_keys=key_list + ["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df, - id_columns=key_list+["id"]) + SQL( + biology_db, + "insert", + table_name="sigma_bs_mean_df", + dataframe=insertion_df, + id_columns=key_list + ["id"], + ) else: # ---- Get previous values in the table table_df = SQL(biology_db, 
"select", table_name="sigma_bs_mean_df") @@ -215,22 +240,27 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create DataFrame insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] # ---- INSERT - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", - dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) # ---- UPDATE values if update_keys: update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - sql_group_update(biology_db, dataframe=update_df, table_name="sigma_bs_mean_df", - columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", - unique_columns=["id"], id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=update_df, + table_name="sigma_bs_mean_df", + columns=["sigma_bs_count", "sigma_bs_sum"], + operation="+", + unique_columns=["id"], + id_columns=["id"], + ) # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) - # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, - # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], # condition=condition_str) # # ---- Check the present keys - # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", # columns=key_list) # # ---- Insert if missing # if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): @@ -238,22 +268,30 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # # ---- Update if not missing # else: # # ---- Create a filter condition command - # condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # # ---- Update the table key - # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key inkey_list]) + # # ---- Update the table key + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) # # ---- Update the actual `sigma_bs` value in the table # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], # operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) - -def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, - file_configuration: dict): - + + +def length_weight_regression( + specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict +): + # Get the spatial column name, if there is one spatial_column = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used - contrast_columns = spatial_column + ["trawl_partition", "sex", "haul_num", "species_id", "length_bin"] - + contrast_columns = spatial_column + [ + "trawl_partition", + "sex", + "haul_num", + "species_id", + "length_bin", + ] + # Gather specimen measurements to represent 'all' fish specimen_data_all = specimen_data.assign(sex="all") @@ -261,7 +299,7 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # ---- Vertical concatenation 
specimen_data_all = pd.concat( [specimen_data[specimen_data["sex"].isin(["male", "female"])], specimen_data_all], - ignore_index=True + ignore_index=True, ) # ---- Remove bad values specimen_data_all.dropna(subset=["length", "weight"], inplace=True) @@ -273,21 +311,26 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # ---- Query database # if not SQL(biology_db, "validate", table_name="specimen_data_df"): # ---- Assign values for key values - key_values = [str(index) + "-" - + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) - for index in specimen_data_all.index] + key_values = [ + str(index) + + "-" + + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) + for index in specimen_data_all.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint specimen_data_all.loc[:, "id"] = key_values # ---- Insert the new data into the database & pull in the combined dataset - specimen_data_sql = sql_data_exchange(biology_db, - dataframe=specimen_data_all, - table_name="specimen_data_df", - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) + specimen_data_sql = sql_data_exchange( + biology_db, + dataframe=specimen_data_all, + table_name="specimen_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame, + ) # ---- Drop SQL db identifier specimen_data_sql.drop(columns="id", inplace=True) - + # Fit length-weight linear regression by male, female, and all fish length_weight_regression_df = ( specimen_data_sql.groupby(["species_id", "sex"]) @@ -329,14 +372,12 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da ) # ---- Merge with the fitted weights weight_fitted_distribution_df = weight_fitted_distribution_df.merge( - weight_fitted_df, - on=["species_id", "sex", "length_bin"], - how="outer" + weight_fitted_df, on=["species_id", "sex", "length_bin"], how="outer" ) # ---- Fill missing counts - weight_fitted_distribution_df["weight_mean"] = ( - weight_fitted_distribution_df["weight_mean"].fillna(0.0) - ) + weight_fitted_distribution_df["weight_mean"] = weight_fitted_distribution_df[ + "weight_mean" + ].fillna(0.0) # ---- Fill missing weights weight_fitted_distribution_df["count"] = ( weight_fitted_distribution_df["count"].fillna(0).astype(int) @@ -354,39 +395,60 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # Check for `weight_fitted_df` in the database file # ---- Create id/primary key - key_values = ["-".join(weight_fitted_distribution_df - .loc[idx, ["species_id", "sex", "length_bin"]] - .values.astype(str)) - for idx in weight_fitted_distribution_df.index] + key_values = [ + "-".join( + weight_fitted_distribution_df.loc[ + idx, ["species_id", "sex", "length_bin"] + ].values.astype(str) + ) + for idx in weight_fitted_distribution_df.index + ] # ---- Add to the output output_df = weight_fitted_distribution_df.assign(id=key_values) # ---- Query database if not SQL(biology_db, "validate", table_name="weight_fitted_df"): # ---- Create - SQL(biology_db, "create", table_name="weight_fitted_df", - dataframe=output_df, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="weight_fitted_df", + dataframe=output_df, + primary_keys=["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="weight_fitted_df", - dataframe=output_df, id_columns=["id"]) + SQL( + biology_db, + "insert", + table_name="weight_fitted_df", + dataframe=output_df, + 
id_columns=["id"], + ) else: # ---- Update the table - sql_group_update(db_file=biology_db, - dataframe=output_df, - table_name="weight_fitted_df", - columns=["weight_fitted"], - unique_columns=["species_id", "sex", "length_bin"], - id_columns=["id"]) - + sql_group_update( + db_file=biology_db, + dataframe=output_df, + table_name="weight_fitted_df", + columns=["weight_fitted"], + unique_columns=["species_id", "sex", "length_bin"], + id_columns=["id"], + ) + # Return the dataframe return weight_fitted_distribution_df -def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, - length_weight_df: pd.DataFrame, file_configuration: dict): - + +def length_bin_weights( + length_data: pd.DataFrame, + specimen_data: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() # ---- Get the spatial key - spatial_key = contrast_columns.copy() + # spatial_key = contrast_columns.copy() # ---- Append additional columns that will be used contrast_columns.extend(["sex", "species_id"]) @@ -394,7 +456,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, biology_db = file_configuration["database"]["biology"] # Pull the relevant data - # SQL(biology_db, "select", table_name="length_df", + # SQL(biology_db, "select", table_name="length_df", # columns=list(set(length_data.columns) - set(["length_bin"]))) # list(set(length_data.columns) - set(["length_bin"])) # Get length distribution @@ -410,6 +472,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, dependent_var="weight_fitted", contrast=["sex", "species_id"], ) + # ---- Create helper/lambda function def weight_interpolator(dataframe_row): sex = dataframe_row["sex"] @@ -418,8 +481,8 @@ def weight_interpolator(dataframe_row): if (sex, species_id) in interpolators: return interpolators[(sex, species_id)](length) else: - return None - + return None + # Extract only sexed fish from the unaged (station 1) length dataset length_data_sexed = length_data[length_data["sex"].isin(["male", "female"])].copy() # ---- Add interpolated weights to the general length dataset @@ -428,8 +491,9 @@ def weight_interpolator(dataframe_row): ) # ---- Convert interpolated weights (summed across length counts) into a table length_table_sexed = ( - length_data_sexed - .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight_interp"].sum() + length_data_sexed.groupby(list(set(contrast_columns).union(set(["length_bin"]))))[ + "weight_interp" + ].sum() ).reset_index() # Remove specimen data with missing data required for this analysis @@ -439,15 +503,16 @@ def weight_interpolator(dataframe_row): specimen_data_filtered = specimen_data_filtered.dropna(subset=["length", "weight"]) # ---- Convert to a table specimen_table_sexed = ( - specimen_data_filtered - .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight"].sum() + specimen_data_filtered.groupby(list(set(contrast_columns).union(set(["length_bin"]))))[ + "weight" + ].sum() ).reset_index() # Check for `length_weight_df` in the database file # ---- Combine the datasets - full_weight_distrib = ( - pd.concat([length_table_sexed.rename(columns={"weight_interp": "weight"}), - specimen_table_sexed], ignore_index=True) + full_weight_distrib = pd.concat( + [length_table_sexed.rename(columns={"weight_interp": "weight"}), specimen_table_sexed], + ignore_index=True, ) # ---- Sum by bin full_weight_distrib 
= ( @@ -455,46 +520,69 @@ def weight_interpolator(dataframe_row): ) # ---- Create id/primary key full_weight_distrib.loc[:, "id"] = ( - full_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1).astype(str) + full_weight_distrib[contrast_columns + ["length_bin"]] + .apply(tuple, axis=1) + .astype(str) .str.replace("'", "") ) # - key_values = ["-".join(length_table_sexed.reset_index() - .loc[idx, ["species_id", "sex", "length_bin"]] - .values.astype(str)) - for idx in length_table_sexed.reset_index().index] + key_values = [ + "-".join( + length_table_sexed.reset_index() + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str) + ) + for idx in length_table_sexed.reset_index().index + ] # ---- Add to the output length_table_sexed["id"] = key_values # ---- Query database if not SQL(biology_db, "validate", table_name="length_weight_df"): # ---- Create full table overall_weight_distrib = ( - pd.DataFrame({"stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + - [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1]}) + pd.DataFrame( + { + "stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + + [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1] + } + ) .merge(pd.DataFrame({"sex": ["male", "female"]}), how="cross") - .merge(pd.DataFrame( - {"species_id": np.unique(file_configuration["species"]["number_code"])} - ), how="cross") + .merge( + pd.DataFrame( + {"species_id": np.unique(file_configuration["species"]["number_code"])} + ), + how="cross", + ) .merge(distribution_df.filter(["length_bin"]), how="cross") ) # ---- Pre-allocate weight overall_weight_distrib.loc[:, "weight"] = 0.0 # ---- Create id/primary key overall_weight_distrib.loc[:, "id"] = ( - overall_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1) + overall_weight_distrib[contrast_columns + ["length_bin"]] + .apply(tuple, axis=1) .astype(str) .str.replace("'", "") ) # ---- Create - SQL(biology_db, "create", table_name="length_weight_df", - dataframe=overall_weight_distrib, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="length_weight_df", + dataframe=overall_weight_distrib, + primary_keys=["id"], + ) # ---- INSERT - SQL(biology_db, "insert", table_name="length_weight_df", - dataframe=overall_weight_distrib) + SQL(biology_db, "insert", table_name="length_weight_df", dataframe=overall_weight_distrib) # ---- UPDATE - sql_group_update(biology_db, dataframe=full_weight_distrib, table_name="length_weight_df", - columns=["weight"], - unique_columns=["id"], id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=full_weight_distrib, + table_name="length_weight_df", + columns=["weight"], + unique_columns=["id"], + id_columns=["id"], + ) # table_df = SQL(biology_db, "select", table_name="length_weight_df") # # ---- Check the table keys # table_keys = np.unique(table_df["id"]).tolist() @@ -509,61 +597,66 @@ def weight_interpolator(dataframe_row): # # ---- Create DataFrame # insertion_df = full_weight_distrib[full_weight_distrib["id"].isin(insertion_keys)] # # ---- INSERT - # SQL(biology_db, "insert", table_name="length_weight_df", + # SQL(biology_db, "insert", table_name="length_weight_df", # dataframe=insertion_df) # # ---- UPDATE values # if update_keys: # update_df = full_weight_distrib[full_weight_distrib["id"].isin(update_keys)] # # ---- Create a filter condition command - # sql_group_update(biology_db, dataframe=update_df, table_name="length_weight_df", + # sql_group_update(biology_db, 
dataframe=update_df, table_name="length_weight_df", # columns=["weight"], - # unique_columns=["id"], id_columns=["id"]) + # unique_columns=["id"], id_columns=["id"]) # # ---- Update the table - # sql_group_update(db_file=biology_db, - # dataframe=length_table_sexed, - # table_name="length_weight_df", + # sql_group_update(db_file=biology_db, + # dataframe=length_table_sexed, + # table_name="length_weight_df", # columns=["weight_interp"], - # unique_columns=contrast_columns, + # unique_columns=contrast_columns, # id_columns=["id"]) # length_sql_sexed - - + # , specimen_sql_sexed # Return outputs return length_table_sexed, specimen_table_sexed -def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: pd.DataFrame, - length_binned: pd.DataFrame, file_configuration: dict): - + +def number_proportions( + specimen_binned: pd.DataFrame, + specimen_binned_filtered: pd.DataFrame, + length_binned: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used contrast_columns.extend(["sex", "species_id"]) - # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) - for col, df in zip(contrast_columns, [specimen_binned, - specimen_binned_filtered, - length_binned])] + dfs = [ + pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip( + contrast_columns, [specimen_binned, specimen_binned_filtered, length_binned] + ) + ] # ---- Reduce into a single DataFrame - count_total = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + count_total = reduce(lambda left, right: pd.merge(left, right, how="cross"), dfs) # ---- Set the indices count_total.set_index(contrast_columns, inplace=True) # ---- Specimen count count_total["total_specimen"] = specimen_binned.groupby(contrast_columns)["count"].sum() # ---- Specimen filtered count - count_total["total_specimen_filtered"] = ( - specimen_binned_filtered.groupby(contrast_columns)["count"].sum() - ) + count_total["total_specimen_filtered"] = specimen_binned_filtered.groupby(contrast_columns)[ + "count" + ].sum() # ---- Length count count_total["total_length"] = length_binned.groupby(contrast_columns)["count"].sum() # ---- Fill NaN count_total.fillna(0, inplace=True) - count_total = ( - count_total.reset_index().set_index(list(set(contrast_columns) - set(["sex", "species_id"]))) + count_total = count_total.reset_index().set_index( + list(set(contrast_columns) - set(["sex", "species_id"])) ) # ---- Grand totals count_total["total_overall"] = ( @@ -577,8 +670,10 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: specimen_number_proportion = specimen_binned_filtered[ specimen_binned_filtered["sex"].isin(["male", "female", "all"]) ].merge( - count_total[list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"])))], - on=contrast_columns + count_total[ + list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"]))) + ], + on=contrast_columns, ) # ---- Within-dataset proportion specimen_number_proportion["proportion_number_specimen"] = ( @@ -602,7 +697,7 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: length_binned["sex"].isin(["male", "female", "all"]) ].merge( count_total[list(set(contrast_columns).union(set(["total_length", "total_overall"])))], - on=contrast_columns + on=contrast_columns, ) 
# ---- Within-dataset proportion length_number_proportion["proportion_number_length"] = ( @@ -616,9 +711,7 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: # Gather unaged (sexed) number proportions # ---- Merge sex_number_proportions = sex_number_proportions.merge( - length_number_proportion.groupby(contrast_columns)[ - "proportion_number_length_overall" - ] + length_number_proportion.groupby(contrast_columns)["proportion_number_length_overall"] .sum() .reset_index(), how="outer", @@ -627,13 +720,15 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: sex_number_proportions["proportion_number_overall"] = ( sex_number_proportions.proportion_number_specimen_overall + sex_number_proportions.proportion_number_length_overall - ) + ) # Return the output return specimen_number_proportion, length_number_proportion, sex_number_proportions -def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, - file_configuration: dict): + +def length_bin_counts( + length_data: pd.DataFrame, specimen_data: pd.DataFrame, file_configuration: dict +): # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() @@ -673,12 +768,12 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, contrasts=contrast_columns, variable="length_count", fun="sum", - ) + ) return ( - specimen_number_distribution, - specimen_number_distribution_filtered, - length_number_distribution + specimen_number_distribution, + specimen_number_distribution_filtered, + length_number_distribution, ) @@ -698,12 +793,12 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # # Generate number counts for the length distribution # length_datasets = ( # biology_dict["specimen_df"] -# .meld(biology_dict["length_df"], +# .meld(biology_dict["length_df"], # contrasts=list(set(contrast_columns).union(["length_bin"]))) -# ) +# ) # # ---- Create 'all' # length_datasets_all = pd.concat([ -# length_datasets[length_datasets["sex"].isin(["male", "female"])], +# length_datasets[length_datasets["sex"].isin(["male", "female"])], # length_datasets.assign(sex="all") # ]) @@ -712,7 +807,7 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # length_datasets_all # .groupby(contrast_columns, observed=False)["length_count"].sum() # ) - + # # Get distinct DataFrame columns # distinct_keys = ( # grouped_length @@ -735,27 +830,27 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # # ---- Create id/primary key # key_values = ["-".join(output_df # .loc[idx, ["species_id", "sex", "length_bin"]] -# .values.astype(str)) +# .values.astype(str)) # for idx in output_df.index] # # ---- Add to the output # output_df["id"] = key_values # # ---- Query database # if not SQL(biology_db, "validate", table_name="length_count_df"): # # ---- Create -# SQL(biology_db, "create", table_name="length_count_df", -# dataframe=output_df, primary_keys=["id"]) +# SQL(biology_db, "create", table_name="length_count_df", +# dataframe=output_df, primary_keys=["id"]) # # ---- Populate table -# SQL(biology_db, "insert", table_name="length_count_df", +# SQL(biology_db, "insert", table_name="length_count_df", # dataframe=output_df, id_columns=["id"]) # else: # # ---- Update the table -# sql_group_update(db_file=biology_db, -# dataframe=output_df, -# table_name="length_count_df", +# sql_group_update(db_file=biology_db, +# dataframe=output_df, +# 
table_name="length_count_df", # columns=["count"], -# unique_columns=contrast_columns, +# unique_columns=contrast_columns, # id_columns=["id"]) - + # # Return output # return output_df @@ -767,25 +862,26 @@ def _quantize_lengths(dataset, distribution): # ---- Cut/merge the underlying histogram/discretized length bins if "length" in dataset.columns: # ---- Cut the intervals - dataset["length_bin"] = pd.cut(dataset["length"], - np.unique(np.hstack([distribution["lower"], - distribution["upper"]])), - labels=distribution["length_bin"]).astype(float) + dataset["length_bin"] = pd.cut( + dataset["length"], + np.unique(np.hstack([distribution["lower"], distribution["upper"]])), + labels=distribution["length_bin"], + ).astype(float) # ---- Return the dataset return dataset - + # Update the data dictionary - biology_dict.update({ - k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items() - }) + biology_dict.update({k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items()}) -def compute_average_weights(specimen_number_proportion: pd.DataFrame, - length_number_proportion: pd.DataFrame, - sex_number_proportions: pd.DataFrame, - length_weight_df: pd.DataFrame, - distribution_df: pd.DataFrame, - file_configuration: dict): +def compute_average_weights( + specimen_number_proportion: pd.DataFrame, + length_number_proportion: pd.DataFrame, + sex_number_proportions: pd.DataFrame, + length_weight_df: pd.DataFrame, + distribution_df: pd.DataFrame, + file_configuration: dict, +): # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() @@ -795,25 +891,30 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, overall_proportions = sex_number_proportions[sex_number_proportions["sex"] == "all"] updated_proportions = sex_number_proportions.copy() - updated_proportions["number_proportion_length_all"] = overall_proportions["proportion_number_length_overall"].values[0] - updated_proportions["number_proportion_specimen_all"] = overall_proportions["proportion_number_specimen_overall"].values[0] + updated_proportions["number_proportion_length_all"] = overall_proportions[ + "proportion_number_length_overall" + ].values[0] + updated_proportions["number_proportion_specimen_all"] = overall_proportions[ + "proportion_number_specimen_overall" + ].values[0] # Calculate the mixed aged and unaged number proportions - updated_proportions["proportion_length"] = ( - updated_proportions["number_proportion_length_all"] / - (updated_proportions["number_proportion_length_all"] + - updated_proportions["proportion_number_specimen_overall"]) + updated_proportions["proportion_length"] = updated_proportions[ + "number_proportion_length_all" + ] / ( + updated_proportions["number_proportion_length_all"] + + updated_proportions["proportion_number_specimen_overall"] ) # ---- Calculate aged number proportions per sex per stratum - updated_proportions["proportion_specimen"] = ( - updated_proportions["proportion_number_specimen_overall"] / ( - updated_proportions["proportion_number_specimen_overall"] + - updated_proportions["proportion_length"] - ) + updated_proportions["proportion_specimen"] = updated_proportions[ + "proportion_number_specimen_overall" + ] / ( + updated_proportions["proportion_number_specimen_overall"] + + updated_proportions["proportion_length"] ) # ---- Reduce the columns - proportion_df = ( - updated_proportions.filter(contrast_columns + ["proportion_length", "proportion_specimen"]) + proportion_df = 
updated_proportions.filter( + contrast_columns + ["proportion_length", "proportion_specimen"] ) # Combine the aged-unaged (or station-specific) proportions for calculations @@ -828,8 +929,9 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ).reset_index() # ---- Convert to Table (to replicate indexed matrix operations) station_proportions_table = station_proportions.pivot_table( - index=["species_id", "group", "sex"], - columns=file_configuration["spatial_column"].copy(), values="proportion" + index=["species_id", "group", "sex"], + columns=file_configuration["spatial_column"].copy(), + values="proportion", ).fillna(0.0) # Calculate the number length proportions that will later be converted into weight @@ -842,19 +944,22 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, .reset_index(name="number_proportion") ) # ---- Length - length_length_distribution = ( - length_number_proportion[length_number_proportion.sex != "unsexed"][ - contrast_columns + ["length_bin", "proportion_number_length"] - ].rename(columns={"proportion_number_length": "number_proportion"}) + length_length_distribution = length_number_proportion[ + length_number_proportion.sex != "unsexed" + ][contrast_columns + ["length_bin", "proportion_number_length"]].rename( + columns={"proportion_number_length": "number_proportion"} ) # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) - for col, df in zip(contrast_columns, [specimen_number_proportion, - length_number_proportion, - sex_number_proportions])] + dfs = [ + pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip( + contrast_columns, + [specimen_number_proportion, length_number_proportion, sex_number_proportions], + ) + ] # ---- Reduce into a single DataFrame - full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how="cross"), dfs) # length_distribution_df = distribution_df.copy() @@ -865,17 +970,29 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) specimen_length_complete = complete_distrib_df.copy() - specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() - specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index( + contrast_columns + ["length_bin"] + ).sort_index() + specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete[ + "number_proportion" + ].fillna(0.0) length_length_complete = complete_distrib_df.copy() - length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() - length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) + length_length_complete["number_proportion"] = length_length_distribution.set_index( + contrast_columns + ["length_bin"] + ).sort_index() + length_length_complete.loc[:, "number_proportion"] = length_length_complete[ + "number_proportion" + ].fillna(0.0) # ---- Concatenate the two datasets combined_number_proportions = ( - pd.concat([specimen_length_complete.assign(group="specimen"), - length_length_complete.assign(group="length")]) + pd.concat( + [ + specimen_length_complete.assign(group="specimen"), + 
length_length_complete.assign(group="length"), + ] + ) ).reset_index() # ---- Convert to Table (to replicate indexed matrix operations) length_proportions_table = combined_number_proportions.pivot_table( @@ -894,47 +1011,55 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, # ---- All fitted_weight_table.loc[:, "all", :] weight_all = fitted_weight_table.loc[:, "all", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "all"] + length_proportions_table.loc[:, "specimen", "all"] * station_proportions_table.loc[:, "specimen", "all"] + length_proportions_table.loc[:, "length", "all"] * station_proportions_table.loc[:, "length", "all"] ) weight_male = fitted_weight_table.loc[:, "male", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "male"] + length_proportions_table.loc[:, "specimen", "male"] * station_proportions_table.loc[:, "specimen", "male"] + length_proportions_table.loc[:, "length", "male"] * station_proportions_table.loc[:, "length", "male"] ) weight_female = fitted_weight_table.loc[:, "female", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "female"] + length_proportions_table.loc[:, "specimen", "female"] * station_proportions_table.loc[:, "specimen", "female"] + length_proportions_table.loc[:, "length", "female"] * station_proportions_table.loc[:, "length", "female"] ) # ---- Combine the averaged weights for each sex and all fish fitted_weight_df = full_contrast_keys.copy() - fitted_weight_df["average_weight"] = ( - np.concatenate([weight_all, weight_male, weight_female]) - ) + fitted_weight_df["average_weight"] = np.concatenate([weight_all, weight_male, weight_female]) # Get database file biology_db = file_configuration["database"]["biology"] # Insert/update the table # ---- Create id/primary key - key_values = ["-".join(fitted_weight_df.reset_index() - .loc[idx, contrast_columns] - .values.astype(str)) - for idx in fitted_weight_df.reset_index().index] + key_values = [ + "-".join(fitted_weight_df.reset_index().loc[idx, contrast_columns].values.astype(str)) + for idx in fitted_weight_df.reset_index().index + ] # ---- Add to the output fitted_weight_df["id"] = key_values if not SQL(biology_db, "validate", table_name="weight_stratum_df"): # ---- Create - SQL(biology_db, "create", table_name="weight_stratum_df", - dataframe=fitted_weight_df, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="weight_stratum_df", + dataframe=fitted_weight_df, + primary_keys=["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="weight_stratum_df", - dataframe=fitted_weight_df, id_columns=["id"]) + SQL( + biology_db, + "insert", + table_name="weight_stratum_df", + dataframe=fitted_weight_df, + id_columns=["id"], + ) else: # ---- Get previous values in the table table_df = SQL(biology_db, "select", table_name="weight_stratum_df") @@ -953,27 +1078,38 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, # ---- Create DataFrame insertion_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(insertion_keys)] # ---- INSERT - SQL(biology_db, "insert", table_name="weight_stratum_df", - dataframe=insertion_df.drop(columns="current_keys")) + SQL( + biology_db, + "insert", + table_name="weight_stratum_df", + dataframe=insertion_df.drop(columns="current_keys"), + ) # ---- UPDATE values if update_keys: # ---- Create DataFrame update_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(update_keys)] # ---- UPDATE - sql_group_update(biology_db, 
dataframe=update_df, - table_name="weight_stratum_df", columns=["average_weight"], - unique_columns=contrast_columns, - id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=update_df, + table_name="weight_stratum_df", + columns=["average_weight"], + unique_columns=contrast_columns, + id_columns=["id"], + ) # Return output return fitted_weight_df -def weight_proportions(catch_data: pd.DataFrame, - specimen_weight_binned: pd.DataFrame, - length_weight_binned: pd.DataFrame, - length_number_proportion: pd.DataFrame, - length_weight_df: pd.DataFrame, - file_configuration: dict): - + +def weight_proportions( + catch_data: pd.DataFrame, + specimen_weight_binned: pd.DataFrame, + length_weight_binned: pd.DataFrame, + length_number_proportion: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one spatial_column = file_configuration["spatial_column"] # ---- Append additional columns that will be used @@ -982,77 +1118,58 @@ def weight_proportions(catch_data: pd.DataFrame, # Calculate grouped totals # ---- Sum the net haul weights from station 1/unaged fish catch_weights = catch_data.count_variable( - contrasts=["species_id"] + spatial_column, - variable="haul_weight", fun="sum" + contrasts=["species_id"] + spatial_column, variable="haul_weight", fun="sum" ) # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - - # For the specimen data + + # For the specimen data # ---- Sum the net haul weights from station 1/unaged fish - specimen_weights_sex = ( - specimen_weight_binned - .groupby(contrast_columns)["weight"] - .sum() - ) + specimen_weights_sex = specimen_weight_binned.groupby(contrast_columns)["weight"].sum() # ---- Total (per stratum, if it exists) specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) - + # For the length (unaged) dataset - length_weights_sex = ( - length_weight_binned - .groupby(contrast_columns)["weight_interp"] - .sum() - ) + length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum() # ---- Further reduce to the grand total (per stratum, if it exists) length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) # ---- Standardize the unaged sexed weights - length_weight_standardized = ( - (length_weights_sex / length_weight_total).unstack(0) - * catch_weights["total_weight"].to_numpy() - ) - + length_weight_standardized = (length_weights_sex / length_weight_total).unstack( + 0 + ) * catch_weights["total_weight"].to_numpy() + # Calculate the specimen weight proportions # ---- Pivot weight bins - specimen_weight_binned_pvt = ( - specimen_weight_binned.pivot_table( - columns=spatial_column, - index=["length_bin", "species_id", "sex"], - values="weight", - observed = False - ) + specimen_weight_binned_pvt = specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed=False, ) # ---- Divide by the aged stratum weights (relative to only aged fish) - specimen_weight_proportions_pvt = ( - specimen_weight_binned_pvt / specimen_weight_total.to_numpy() - ) + specimen_weight_proportions_pvt = specimen_weight_binned_pvt / specimen_weight_total.to_numpy() # ---- Pivot back to the desired format specimen_weight_proportion = ( - specimen_weight_proportions_pvt - .stack().reset_index(name="weight_proportion") - .pivot_table(columns=spatial_column + ["species_id", "sex"], - index="length_bin", 
values="weight_proportion") - ) - # ---- Calculate the internal (i.e. only aged fish) for each sex - within_specimen_sex_proportions = ( - specimen_weight_proportion.sum() + specimen_weight_proportions_pvt.stack() + .reset_index(name="weight_proportion") + .pivot_table( + columns=spatial_column + ["species_id", "sex"], + index="length_bin", + values="weight_proportion", + ) ) + # ---- Calculate the internal (i.e. only aged fish) for each sex + within_specimen_sex_proportions = specimen_weight_proportion.sum() # Calculate the total strata weights # ---- Index `catch_weights` catch_weights_idx = catch_weights.set_index(spatial_column + ["species_id"]) # ---- Compute the spatially-stratified/grouped weights - spatial_weights = ( - pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) - .pivot_table( - columns=spatial_column, - aggfunc="sum", - values="total_weight", - observed=False - ) - ) - + spatial_weights = pd.concat( + [specimen_weight_total.to_frame("total_weight"), catch_weights_idx] + ).pivot_table(columns=spatial_column, aggfunc="sum", values="total_weight", observed=False) + # Calculate the weight proportions relative to the overall stratum weights # ---- Aged # -------- Reformat into dataframe and merge with total stratum weights @@ -1067,9 +1184,9 @@ def weight_proportions(catch_data: pd.DataFrame, specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] ) # -------- Consolidate to calculate the sexed proportions per stratum - specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(spatial_column + ["species_id", "sex"])[ - "weight_proportion_overall" - ].sum() + specimen_weight_sex_proportions = specimen_weights_binned_df.groupby( + spatial_column + ["species_id", "sex"] + )["weight_proportion_overall"].sum() # ---- Unaged # -------- Reformat into dataframe and merge with total stratum weights length_weights_sex_standardized_df = ( @@ -1085,14 +1202,18 @@ def weight_proportions(catch_data: pd.DataFrame, ) # -------- Back-calculate the sexed weight proportions relative to just unaged fish # ------------ Aggregate proportions - length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( - columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" - ).transpose().unstack(["species_id"]).sum(axis=0) + length_total_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" + ) + .transpose() + .unstack(["species_id"]) + .sum(axis=0) + ) # ------------ Re-compute the proportions length_weight_sex_proportions = ( length_weights_sex_standardized_df.pivot_table( - index=["species_id", "sex"], columns=spatial_column, - values="weight_proportion_overall" + index=["species_id", "sex"], columns=spatial_column, values="weight_proportion_overall" ) / length_total_sex_proportions.to_numpy() ) @@ -1115,11 +1236,17 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Generate the fitted weight array fitted_weights = length_weight_all.copy() # ---- Get actual length bins in dataset - fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] + fitted_weights = fitted_weights[ + fitted_weights["length_bin"].isin(length_number_proportions["length_bin"]) + ] # ---- Apportion the averaged weights - length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + length_apportioned_weights 
= ( + length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + ) # ---- Compute the average weight proportions per length bin per stratum - average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) + average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum( + axis=1 + ) # ---- Convert back to a DataFrame average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( name="weight_proportion" @@ -1132,7 +1259,9 @@ def weight_proportions(catch_data: pd.DataFrame, unaged_proportions = 1 - aged_proportions # -------- Re-weight the unaged sexed proportions unaged_weight_sex_proportions_overall = ( - (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()) + .astype(float) + .fillna(0.0) ) unaged_proportions.unstack().transpose() @@ -1148,18 +1277,18 @@ def weight_proportions(catch_data: pd.DataFrame, ) ) # ---- Aged: stratum-sex relative to total weights - aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( - spatial_column + ["species_id", "sex"] - ) + aged_sex_df = within_specimen_sex_proportions.reset_index( + name="weight_proportion_aged" + ).set_index(spatial_column + ["species_id", "sex"]) # ---- Add the aged sex proportiosn relative to the overall survey aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions # ---- Consolidate the aged and unaged sexed dataframes # -------- Initialize the dataframe - aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + spatial_column) - # --------- Add the within-unaged weight proportions - aged_unaged_sex_proportions["weight_proportion_unaged"] = ( - length_weight_sex_proportions.stack() + aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index( + ["species_id", "sex"] + spatial_column ) + # --------- Add the within-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_unaged"] = length_weight_sex_proportions.stack() # --------- Add the overall-unaged weight proportions aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( unaged_weight_sex_proportions_overall.stack() @@ -1169,10 +1298,10 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Set index aged_unaged_proportions.set_index(spatial_column + ["species_id"], inplace=True) # -------- Add unaged proportions - aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() + aged_unaged_proportions["unaged_proportions"] = unaged_proportions # .reset_index() # ---- Reset the index aged_unaged_proportions = aged_unaged_proportions.reset_index() - + # Return output return { "aged_weight_proportions_df": aged_overall_df, @@ -1183,10 +1312,12 @@ def weight_proportions(catch_data: pd.DataFrame, "aged_unaged_weight_proportions_df": aged_unaged_proportions, } + # TODO: NEED TO UPDATE TO EITHER INSERT IF NOT PRESENT OR UPDATE OTHERWISE ! ! ! # ! 
SEE ABOVE -def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, - file_configuration: dict): +def summarize_strata( + nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, file_configuration: dict +): # Get biology database acoustic_db = file_configuration["database"]["acoustics"] @@ -1200,25 +1331,47 @@ def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame # Create copy strata_df = spatial_data.copy() - # Define new columns - strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", - "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan + # Define new columns + strata_df[ + [ + "length_mean", + "weight_mean", + "TS_mean", + "number_density_mean", + "biomass_density_mean", + "abundance_sum", + "biomass_sum", + ] + ] = np.nan # ---- Drop 'latitude_interval' strata_df.drop(columns=["latitude_interval"], inplace=True) # ---- Create - SQL(biology_db, "create", table_name="strata_summary_df", - dataframe=strata_df, primary_keys=["stratum"]) + SQL( + biology_db, + "create", + table_name="strata_summary_df", + dataframe=strata_df, + primary_keys=["stratum"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="strata_summary_df", - dataframe=strata_df, id_columns=["stratum"]) - + SQL( + biology_db, + "insert", + table_name="strata_summary_df", + dataframe=strata_df, + id_columns=["stratum"], + ) + # Get unique strata values strata_values = np.unique(nasc_biology_data["stratum"]).tolist() - + # Update the table - sql_update_strata_summary(source_db=acoustic_db, target_db=biology_db, - source_table="survey_data_df", target_table="strata_summary_df", - data_columns=[("number_density", "mean"), - ("biomass_density", "mean")], - strata=strata_values) \ No newline at end of file + sql_update_strata_summary( + source_db=acoustic_db, + target_db=biology_db, + source_table="survey_data_df", + target_table="strata_summary_df", + data_columns=[("number_density", "mean"), ("biomass_density", "mean")], + strata=strata_values, + ) diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 388a8240..6c41b33a 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -1,5 +1,3 @@ -from datetime import datetime - import pandas as pd LIVE_DATA_STRUCTURE = { @@ -28,7 +26,7 @@ "results": { "acoustics": dict(), "biology": dict(), - "stratified": dict(), + "stratified": dict(), }, } @@ -54,8 +52,12 @@ "optional_keys": [], "keys": { "*": { - "required_keys": ["number_code", "TS_L_slope", "TS_L_intercept", - "length_units"], + "required_keys": [ + "number_code", + "TS_L_slope", + "TS_L_intercept", + "length_units", + ], "optional_keys": ["character_code"], "keys": { "number_code": int, @@ -78,8 +80,8 @@ "optional_keys": [], "keys": { "bins": [float, int], - }, }, + }, "stations": { "required_keys": ["separate_stations", "station_id"], "optional_keys": [], @@ -99,7 +101,7 @@ }, "geospatial": { "required_keys": ["projection", "link_biology_acoustics"], - "optional_keys": ["inpfc", "griddify"], + "optional_keys": ["inpfc", "griddify"], "keys": { "inpfc": { "required_keys": ["latitude_max", "stratum_names"], @@ -120,20 +122,22 @@ "latitude": [float], "longitude": [float], "x": [float], - "y": [float] + "y": [float], }, - }, + }, "grid_resolution": { - "required_keys":[("latitude_distance", "longitude_distance"), - ("x_distance", "y_distance")], + "required_keys": [ + ("latitude_distance", "longitude_distance"), + ("x_distance", "y_distance"), + ], "optional_keys": [], "keys": { 
"longitude_distance": float, "latitude_distance": float, "x_distance": float, "y_distnace": float, - } - } + }, + }, }, }, "link_biology_acoustics": ["closest_haul", "global", "INPFC", "weighted_haul"], @@ -146,7 +150,7 @@ # Required data configuration YAML structure LIVE_CONFIG_DATA_MODEL = { "required_keys": ["ship_id", "survey_year", "database_directory", "input_directories"], - "optional_keys": ["species", "data_root_dir"], + "optional_keys": ["species", "data_root_dir"], "keys": { "data_root_dir": str, "database_directory": str, @@ -164,8 +168,14 @@ }, }, "biology": { - "required_keys": ["database_name", "directory", "extension", "file_index", - "file_ids", "file_name_formats"], + "required_keys": [ + "database_name", + "directory", + "extension", + "file_index", + "file_ids", + "file_name_formats", + ], "optional_keys": [], "keys": { "directory": str, @@ -184,14 +194,14 @@ "keys": { "*": str, }, - }, + }, "file_index": { "required_keys": ["*"], "optional_keys": [], "keys": { "*": [str], }, - }, + }, }, }, "coastline": { @@ -234,11 +244,11 @@ }, "xarray_variables": { "NASC": float, - "frequency_nominal": float, + "frequency_nominal": float, "latitude": float, "longitude": float, "ping_time": "datetime64[ns]", - } + }, }, "biology": { "catch": { @@ -253,7 +263,7 @@ "species_code": "species_id", "overall_weight": "haul_weight", "catch_perc": "catch_percentage", - } + }, }, "trawl_info": { "dtypes": { @@ -298,7 +308,7 @@ "partition": "trawl_partition", "sex": "sex", "length": "length", - "organism_weight": "weight" + "organism_weight": "weight", }, }, }, @@ -320,16 +330,8 @@ "dtype": int, "expression": r"(?P\d+)", }, - "SPECIES_CODE": { - "name": "species_id", - "dtype": int, - "expression": r"(?P\d+)" - }, - "FILE_ID": { - "name": "file_id", - "dtype": str, - "expression": r"(?P.+)" - }, + "SPECIES_CODE": {"name": "species_id", "dtype": int, "expression": r"(?P\d+)"}, + "FILE_ID": {"name": "file_id", "dtype": str, "expression": r"(?P.+)"}, } SPATIAL_CONFIG_MAP = { @@ -338,21 +340,13 @@ "choices": ["distance", "time"], }, }, - "global" : {}, + "global": {}, "griddify": { "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, + "longitude": {"types": [float]}, + "latitude": {"types": [float]}, + "northings": {"types": [float]}, + "eastings": {"types": [float]}, "pairs": [("longitude", "latitude"), ("northings", "eastings")], }, "grid_resolution": { @@ -374,21 +368,20 @@ "grid_size_y": { "types": int, }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], + "pairs": [ + ("x_distance", "y_distance"), + ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y"), + ], }, }, "inpfc": { - "stratum_names": { - "types": [int, str] - }, + "stratum_names": {"types": [int, str]}, "latitude_max": { "types": [float], }, }, "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, + "proximity": {"choices": ["distance", "time"]}, }, -} \ No newline at end of file +} diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 3018604e..f763c0be 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -1,32 +1,34 @@ -from pathlib import Path -from typing import Union, Tuple, Optional, List -import yaml +import copy +import os import re -from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database -import pandas as pd 
-import numpy as np from datetime import datetime -import xarray as xr -import os -import copy +from pathlib import Path +from typing import List, Optional, Union + import boto3 +import numpy as np +import pandas as pd +import xarray as xr +import yaml from botocore.exceptions import ClientError -from .live_core import( +from .live_core import ( + LIVE_CONFIG_DATA_MODEL, + LIVE_CONFIG_INIT_MODEL, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP, - LIVE_CONFIG_INIT_MODEL, - LIVE_CONFIG_DATA_MODEL ) - from .live_spatial_methods import create_inpfc_strata +from .sql_methods import initialize_database, query_processed_files + # TODO: Incorporate complete YAML file validator # TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - +def live_configuration( + live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path] +): + # Validate file existence # ---- str-to-Path conversion, if necessary live_init_config_path = Path(live_init_config_path) @@ -42,19 +44,21 @@ def live_configuration(live_init_config_path: Union[str, Path], ] raise FileNotFoundError( f"The following configuration files do not exist: {missing_config}." - ) + ) # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) # -------- Validate - init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, - live_init_config_path) + init_config = validate_live_config( + copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, live_init_config_path + ) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, - live_file_config_path) - + file_config = validate_live_config( + copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, live_file_config_path + ) + # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) @@ -65,36 +69,38 @@ def live_configuration(live_init_config_path: Union[str, Path], f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " f"file." ) - + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: List[str], - xarray_kwargs: dict = {}) -> tuple: + +def read_acoustic_files(acoustic_files: List[str], xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. 
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] # Read all of the zarr files - results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) - for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(file, acoustics_config_map, - xarray_kwargs=xarray_kwargs) - for file in acoustic_files - )] + results_list = [ + (data_df, unit_dict) if i == 0 else (data_df, None) + for i, (data_df, unit_dict) in enumerate( + read_acoustic_zarr(file, acoustics_config_map, xarray_kwargs=xarray_kwargs) + for file in acoustic_files + ) + ] # Concatenate the dataframe component - acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index = True) + acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index=True) # ---- Add the `acoustic_data_units` to the dictionary and output the resulting tuple return acoustic_data_df, results_list[0][1] if results_list else None -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): + +def filter_filenames( + directory_path: Path, filename_id: str, files: List[Path], file_extension: str +): # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + file_id_format = re.sub(r"\{FILE_ID:([^}]+)\}", r"\1", filename_id) # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Compile the pattern @@ -102,8 +108,9 @@ def filter_filenames(directory_path: Path, filename_id: str, pattern = re.compile(escaped_file_id_format.replace(r"\*", ".*")) # pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') # ---- Create Path object with the generalized format: S3 - s3_files = [filename for filename in files - if filename.startswith("s3://") and pattern.search(filename)] + s3_files = [ + filename for filename in files if filename.startswith("s3://") and pattern.search(filename) + ] # ---- Local search local_files = Path(directory_path).glob(f"{file_id_format}.{file_extension}") # ---- Assign to subfile path object @@ -116,19 +123,21 @@ def filter_filenames(directory_path: Path, filename_id: str, # Convert list of proposed files from Path to String file_str = [str(file) for file in list(files)] - + # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[str], file_configuration: dict, - pandas_kwargs: dict = {}): + +def read_biology_files( + biology_files: List[str], file_configuration: dict, pandas_kwargs: dict = {} +): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] # Get the file-specific settings, datatypes, columns, etc. 
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] # ---- Extract the expected file name ID's biology_file_ids = file_settings["file_name_formats"] # ---- Extract all of the file ids @@ -141,54 +150,59 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, directory_path = "/".join([file_configuration["data_root_dir"], file_settings["directory"]]) else: directory_path = file_settings["directory"] - + # Add SQL file to dict # file_configuration["database"]["biology"] = ( - # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] # ) file_configuration["database"]["biology"] = ( - # Path(file_configuration["database_directory"]) / file_settings["database_name"] - "/".join([file_configuration["database_directory"], file_settings["database_name"]]) + # Path(file_configuration["database_directory"]) / file_settings["database_name"] + "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - biology_file_ids[dataset], - biology_files, - file_settings["extension"]) + dataset_files = filter_filenames( + directory_path, biology_file_ids[dataset], biology_files, file_settings["extension"] + ) # ---- If there are dataset files available if dataset_files: # ---- Read in validated biology data - dataframe_list = [read_biology_csv(file, - file_settings["file_name_formats"][dataset], - biology_config_map[dataset], - pandas_kwargs) - for file in dataset_files] + dataframe_list = [ + read_biology_csv( + file, + file_settings["file_name_formats"][dataset], + biology_config_map[dataset], + pandas_kwargs, + ) + for file in dataset_files + ] # ---- Concatenate the dataset dataframe_combined = pd.concat(dataframe_list, ignore_index=True) # ---- Lower-case sex - if "sex" in dataframe_combined.columns: + if "sex" in dataframe_combined.columns: dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined[ + "trawl_partition" + ].str.lower() # ---- Reformat datetime column if "datetime" in dataframe_combined.columns: dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) # ---- Add to the data dictionary biology_output[f"{dataset}_df"] = dataframe_combined - + # Return the output return biology_output + def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) -> tuple: - + # Format the file reading configuration # ---- Concatenate into a full configuration map - full_config_map = {**config_map["xarray_coordinates"], - **config_map["xarray_variables"]} + full_config_map = {**config_map["xarray_coordinates"], **config_map["xarray_variables"]} # Determine the file loading method for the `acoustic_files` zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto", **xarray_kwargs) @@ -197,11 +211,9 @@ def read_acoustic_zarr(file: Path, config_map: 
dict, xarray_kwargs: dict = {}) - # ---- Convert to a DataFrame zarr_data_df = zarr_data_ds.to_dataframe().reset_index() # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) + missing_columns = [key for key in full_config_map.keys() if key not in zarr_data_df.columns] # ---- Raise Error, if needed - if missing_columns: + if missing_columns: raise ValueError( f"The following columns are missing from at least one file: in " f"{', '.join(missing_columns)}!" @@ -210,7 +222,7 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) # Add the filename as a column - zarr_data_df_filtered["source"] = Path(file).name + zarr_data_df_filtered["source"] = Path(file).name # Gather some of the units data_units = { @@ -222,6 +234,7 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - # Return a Tuple return zarr_data_df_filtered, data_units + def construct_directorypath(file_configuration: dict, file_settings: dict): """Construct the root directory path.""" @@ -235,18 +248,21 @@ def construct_directorypath(file_configuration: dict, file_settings: dict): data_directory = file_settings["directory"] # Return the directory path - if root_directory != "": + if root_directory != "": return "/".join([root_directory, data_directory]) else: return data_directory + def is_s3_path(path): """Check if a path is an S3 path.""" return path.startswith("s3://") + # TODO: Documentation -def validate_data_directory(file_configuration: dict, dataset: str, - input_filenames: Optional[list] = None) -> List[Path]: +def validate_data_directory( + file_configuration: dict, dataset: str, input_filenames: Optional[list] = None +) -> List[Path]: # Get the dataset file settings file_settings = file_configuration["input_directories"][dataset] @@ -256,10 +272,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate `input_filenames` input if input_filenames is not None and not isinstance(input_filenames, list): - raise TypeError( - "Data loading argument `input_filenames` must be a list." 
- ) - + raise TypeError("Data loading argument `input_filenames` must be a list.") + # Format data filenames if input_filenames is not None: data_files = ["/".join([directory_path, filename]) for filename in input_filenames] @@ -277,14 +291,16 @@ def validate_data_directory(file_configuration: dict, dataset: str, # ---- Validate validate_local_path(directory_path, file_settings) # ---- Format data files - if input_filenames is None: + if input_filenames is None: data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) - + # Clean the filenames data_files = [ - re.sub(r'//', r'\\', str(filename)).replace('/', '\\') - if not str(filename).startswith('s3://') - else str(filename) + ( + re.sub(r"//", r"\\", str(filename)).replace("/", "\\") + if not str(filename).startswith("s3://") + else str(filename) + ) for filename in data_files ] @@ -296,81 +312,82 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Drop incomplete datasets if dataset == "biology": - data_files = validate_complete_biology_dataset(data_files, - directory_path, - file_configuration) - + data_files = validate_complete_biology_dataset( + data_files, directory_path, file_configuration + ) + # Query the SQL database to process only new files (or create the db file in the first place) - valid_files, file_configuration["database"][dataset] = ( - query_processed_files(database_root_directory, file_settings, data_files) + valid_files, file_configuration["database"][dataset] = query_processed_files( + database_root_directory, file_settings, data_files ) # Return the valid filenames/paths return valid_files + def validate_s3_path(s3_path: str, cloud_credentials: dict): """Check if (parts of) S3 path exists.""" # Redundant validation that S3 object validation is appropriate if not is_s3_path(s3_path): - raise ValueError("The path is not an S3 path.") - + raise ValueError("The path is not an S3 path.") + # Validate credentials - if not all([True if param in cloud_credentials.keys() else False - for param in ["key", "secret"]]): + if not all( + [True if param in cloud_credentials.keys() else False for param in ["key", "secret"]] + ): # ---- Find missing credentials missing_creds = set(["key", "secret"]) - set(cloud_credentials) # ---- Format into string missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) # ---- Raise Error - raise PermissionError( - f"Required S3 credentials missing: {missing_creds_str}." - ) + raise PermissionError(f"Required S3 credentials missing: {missing_creds_str}.") # Remove the s3:// prefix - s3_path_reduced = s3_path[len("s3://"):] + s3_path_reduced = s3_path[len("s3://") :] # Split into bucket and key parts = s3_path_reduced.split("/", 1) if len(parts) < 2: raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - + # Get bucket name and directory keys bucket_name, directory = parts # Initialize the S3 client - s3_client = boto3.client("s3", - aws_access_key_id=cloud_credentials["key"], - aws_secret_access_key=cloud_credentials["secret"]) - + s3_client = boto3.client( + "s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"], + ) + # Check if the bucket exists try: s3_client.head_bucket(Bucket=bucket_name) - except ClientError as e: + except ClientError: raise FileNotFoundError( f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
) - + # Check if the S3 directory exists try: - # ---- Ping a response from the bucket + # ---- Ping a response from the bucket response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) # ---- Check for `Contents` if "Contents" not in response: raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") - except ClientError as e: + except ClientError as e: # --- Raise Error and propagate it upwards raise e + def validate_local_path(directory_path: str, file_settings: dict): # Validate filepath # ---- Error evaluation (if applicable) if not Path(directory_path).exists(): - raise FileNotFoundError( - f"The data directory [{directory_path}] does not exist." - ) - + raise FileNotFoundError(f"The data directory [{directory_path}] does not exist.") + # Validate that files even exist # ---- List available files of target extension data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) @@ -381,9 +398,9 @@ def validate_local_path(directory_path: str, file_settings: dict): ) -def validate_complete_biology_dataset(data_files: List[str], - directory_path: str, - file_configuration: dict): +def validate_complete_biology_dataset( + data_files: List[str], directory_path: str, file_configuration: dict +): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] @@ -396,33 +413,33 @@ def validate_complete_biology_dataset(data_files: List[str], def get_file_haul_number(filename, format_string): # Step 1: Extract the filename from the full path filename_only = os.path.basename(filename) - + # Remove the file extension from the filename filename_no_ext = os.path.splitext(filename_only)[0] # Split the format string and filename into parts - format_parts = re.findall(r'\{[^}]+\}|[^_]+', format_string) - filename_parts = filename_no_ext.split('_') + format_parts = re.findall(r"\{[^}]+\}|[^_]+", format_string) + filename_parts = filename_no_ext.split("_") # Find the index of {HAUL} in format_parts - haul_index = format_parts.index('{HAUL}') + haul_index = format_parts.index("{HAUL}") # Extract and return the haul number from filename_parts if haul_index < len(filename_parts): return filename_parts[haul_index] return None - + # Organize dataset by their respective dataset-type - dataset_dict = {key: filter_filenames(directory_path, - ds, - data_files, - file_settings["extension"]) - for key, ds in biology_file_ids.items()} - + dataset_dict = { + key: filter_filenames(directory_path, ds, data_files, file_settings["extension"]) + for key, ds in biology_file_ids.items() + } + # Extract the haul numbers extracted_hauls = { - key: set(get_file_haul_number(filename, biology_file_ids.get(key, '')) - for filename in filenames) + key: set( + get_file_haul_number(filename, biology_file_ids.get(key, "")) for filename in filenames + ) for key, filenames in dataset_dict.items() } @@ -434,8 +451,7 @@ def get_file_haul_number(filename, format_string): filename for key, filenames in dataset_dict.items() for filename in filenames - if get_file_haul_number(filename, biology_file_ids.get(key, '')) - in common_hauls + if get_file_haul_number(filename, biology_file_ids.get(key, "")) in common_hauls ] # Get bad files for DEBUG @@ -443,14 +459,12 @@ def get_file_haul_number(filename, format_string): filename for key, filenames in dataset_dict.items() for filename in filenames - if get_file_haul_number(filename, biology_file_ids.get(key, '')) - not in common_hauls + if get_file_haul_number(filename, biology_file_ids.get(key, "")) 
not in common_hauls ] # ---- Create list non_filtered_filenames_lst = "\n".join(non_filtered_filenames) print( - f"The following files are parts of incomplete filesets: \n" - f"{non_filtered_filenames_lst}" + f"The following files are parts of incomplete filesets: \n" f"{non_filtered_filenames_lst}" ) # Return the curated filename list @@ -461,40 +475,37 @@ def compile_filename_format(file_name_format: str): # Create a copy of `file_name_format` regex_pattern = file_name_format - + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern for key, value in LIVE_FILE_FORMAT_MAP.items(): regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P\1)", regex_pattern) # Compile the regex pattern and return the output return re.compile(regex_pattern) + def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, - usecols=list(config_map["dtypes"].keys()), - storage_options=pandas_kwargs) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns - missing_columns = ( - [key for key in config_map["dtypes"].keys() if key not in df.columns] - ) + missing_columns = [key for key in config_map["dtypes"].keys() if key not in df.columns] # ---- Raise Error, if needed - if missing_columns: + if missing_columns: raise ValueError( f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" ) # ---- Ensure the correct datatypes df_validated = df.astype(config_map["dtypes"]) - # ---- Replace column names and drop + # ---- Replace column names and drop df_validated = df_validated.rename(columns=config_map["names"]) # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + filename_substrings = re.findall(r"\{([^:}]+)(?::[^}]+)?}", pattern) # ---- Create sub-list of columns that can be added to the DataFrame valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) @@ -504,30 +515,32 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_k match_obj = compiled_regex.search(file) # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: + for i in valid_tags: matched_key = LIVE_FILE_FORMAT_MAP[i] df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) # Return the resulting DataFrame return df_validated + def infer_datetime_format(timestamp_str: Union[int, str]): patterns = { - r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS - r"^\d{8}$": "%Y%m%d", # YYYYMMDD - r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS - r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD - r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d", # YYYY/MM/DD } - + for pattern, date_format in patterns.items(): if re.match(pattern, timestamp_str): return date_format - + raise ValueError("Unknown timestamp 
format") + def convert_datetime(timestamp: Union[int, str, pd.Series]): if isinstance(timestamp, pd.Series): @@ -544,6 +557,7 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) + def validate_hauls_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -552,20 +566,21 @@ def validate_hauls_config(spatial_config: dict, link_method: str): # Extract the defined settings input_method_settings = spatial_config[link_method] - # Check for `proximity` + # Check for `proximity` if "proximity" not in input_method_settings.keys(): raise KeyError( "The following parameters are missing from the biology-acoustic linking method: " "'proximity'!" ) - + # Evaluate valid options for `proximity` if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: raise KeyError( f"Value biology-acoustic linking method parameter `proximity` must be one of the : " f"following: {link_method_settings['proximity']['choices']}." - ) - + ) + + def validate_griddify_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -581,8 +596,8 @@ def validate_griddify_config(spatial_config: dict, link_method: str): raise KeyError( f"The following parameters are missing from the biology-acoustic linking method: " f"{list(key_diff)}!" - ) - + ) + # Iterate through the keys to evaluate inputs for key in list(input_method_settings.keys()): # ---- Subset the input method config @@ -596,7 +611,7 @@ def validate_griddify_config(spatial_config: dict, link_method: str): raise KeyError( f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " f"configuration." - ) + ) # ---- Check if the appropriate coordinate pairs are present coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] # ---- Count the number of paired coordinates @@ -630,7 +645,8 @@ def validate_griddify_config(spatial_config: dict, link_method: str): f"Biology-acoustic linking method argument '{parameter}' within '{key}' " f"for method '{link_method}' must be one of the following types within a list: " f"{config_dtypes}." - ) + ) + def validate_inpfc_config(spatial_config: dict, link_method: str): @@ -648,14 +664,14 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): f"The following parameters are missing from the biology-acoustic linking method: " f"{list(key_diff)}!" ) - + # Iterate through the keys to evaluate inputs for key in list(input_method_settings.keys()): # ---- Subset the input method config input = input_method_settings[key] # ---- Get the original config of the dtypes model = link_method_settings[key]["types"] - # ---- Evaluate if a list + # ---- Evaluate if a list if not isinstance(input, list): raise TypeError( f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " @@ -666,8 +682,9 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): raise TypeError( f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " f"be one of the following types within a list: {model}." 
- ) - + ) + + def configure_spatial_settings(file_configuration: dict): # Extract spatial strata *only* if spatial information from the configuration settings @@ -690,16 +707,19 @@ def configure_spatial_settings(file_configuration: dict): spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) # ---- Update the stratum classification in the primary file configuration file_configuration.update({"spatial_column": ["stratum"]}) - else: + else: # ---- Empty `spatial_column` key file_configuration.update({"spatial_column": []}) # Add grid - file_configuration.update({"gridding_column": file_configuration["spatial_column"] + ["x", "y"]}) + file_configuration.update( + {"gridding_column": file_configuration["spatial_column"] + ["x", "y"]} + ) # Return the dictionary as an output return spatial_dict + def validate_spatial_config(spatial_config: dict): # Check the link method @@ -711,7 +731,7 @@ def validate_spatial_config(spatial_config: dict): f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." ) - + # Verify that associated parameters are present in the configuration settings # ---- Get keys as a list config_keys = list(spatial_config.keys()) @@ -720,18 +740,19 @@ def validate_spatial_config(spatial_config: dict): raise ValueError( f"No parameters provided for the biology-acoustic linking ([{link_method}])." ) - + # Check key settings - if link_method == "griddify": + if link_method == "griddify": validate_griddify_config(spatial_config, link_method) - elif link_method == "inpfc": + elif link_method == "inpfc": validate_inpfc_config(spatial_config, link_method) - elif link_method != "global": + elif link_method != "global": validate_hauls_config(spatial_config, link_method) + def validate_live_config(config: dict, reference_model: dict, filename: Union[str, Path]): """Validate configuration inputs""" - + # Convert to string if Path if isinstance(filename, Path): filename = str(filename) @@ -752,6 +773,7 @@ def validate_keys(config, model, path=""): def get_keys_from_tuples(tuples): """Parse key names from tuples""" return {key for group in tuples if isinstance(group, tuple) for key in group} + # ---- def find_missing_keys(required_keys, keys_to_check): """Find any missing keys""" @@ -764,14 +786,16 @@ def find_missing_keys(required_keys, keys_to_check): missing_keys = [key for key in valid_keys_in_tuples if key not in keys_to_check] unexpected_keys = [key for key in keys_to_check if key not in all_required_keys] return missing_keys, unexpected_keys + # ---- def check_for_missing_keys(required_keys, config_keys, path): """Check whether any required keys are missing""" missing_required = [] for key in required_keys: if isinstance(key, tuple): - missing_keys, unexpected_keys_for_keys = find_missing_keys(required_keys, - config_keys) + missing_keys, unexpected_keys_for_keys = find_missing_keys( + required_keys, config_keys + ) if missing_keys: raise ValueError( f"Missing required configuration key(s): " @@ -787,31 +811,38 @@ def check_for_missing_keys(required_keys, config_keys, path): f"{path} in configuration file '{filename}'." 
) return [] + # ---- def check_for_unexpected_keys(config_keys, required_keys): """Check for unexpected keys""" unexpected_keys = [] for key in config_keys: - if (key not in required_keys - and key not in optional_keys - and "*" not in required_keys): + if ( + key not in required_keys + and key not in optional_keys + and "*" not in required_keys + ): if not any(key in group for group in required_keys if isinstance(group, tuple)): unexpected_keys.append(key) return unexpected_keys # Top-level validation if path == "": - missing_primary_keys = [key for key in required_keys - if key != "*" and key not in config] + missing_primary_keys = [ + key for key in required_keys if key != "*" and key not in config + ] if missing_primary_keys: raise ValueError( f"Missing primary configuration key(s): {', '.join(missing_primary_keys)} in " f"configuration file '{filename}'." ) - unexpected_primary_keys = [key for key in config - if key not in required_keys - and key not in optional_keys - and "*" not in required_keys] + unexpected_primary_keys = [ + key + for key in config + if key not in required_keys + and key not in optional_keys + and "*" not in required_keys + ] # ---- Raise error if unexpected_primary_keys: raise ValueError( @@ -828,14 +859,15 @@ def check_for_unexpected_keys(config_keys, required_keys): raise ValueError( f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path} in " f"configuration file '{filename}'." - ) + ) # Recursively validate nested dictionaries and lists for key, sub_model in keys.items(): if key == "*" and isinstance(sub_model, dict): for sub_key in config: - validate_keys(config[sub_key], - sub_model, path=f"{path}.{sub_key}" if path else sub_key) + validate_keys( + config[sub_key], sub_model, path=f"{path}.{sub_key}" if path else sub_key + ) elif key == "*" and isinstance(sub_model, list): for sub_key in config: validate_list(config[sub_key], sub_model, key, path) @@ -879,6 +911,7 @@ def validate_list(config_value, allowed_types, key, path): f"Invalid type for key '{key}' at {path} in {filename}. 
Expected a list of: " f"{allowed_types}" ) + # ---- def validate_type(config_value, expected_type, key, path): """Validate configuration with model that is at the furthest point along a branch""" diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index a2dcaa46..07672a0f 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -1,31 +1,24 @@ -import yaml -import re +from typing import List + +import numpy as np +import pandas as pd -from functools import reduce -from .sql_methods import SQL, sql_group_update, query_dataset, get_unique_identifiers from .live_biology import summarize_strata from .live_spatial_methods import update_population_grid -from pathlib import Path -from typing import Union, Tuple, Optional, List - -import pandas as pd +from .sql_methods import query_dataset, sql_group_update -import numpy as np -from .live_core import( - LIVE_FILE_FORMAT_MAP, - LIVE_INPUT_FILE_CONFIG_MAP -) +def get_average_strata_weights(db_file: str, data_dict: dict, unique_columns: list): -def get_average_strata_weights(db_file: str, - data_dict: dict, - unique_columns: list): - # Get corresponding `weight_fitted_df` from the database - weight_fitted_sql_df = query_dataset(db_file, data_dict, table_name="weight_stratum_df", - data_columns=unique_columns + ["average_weight"], - unique_columns=unique_columns, - constraint="sex == 'all'") + weight_fitted_sql_df = query_dataset( + db_file, + data_dict, + table_name="weight_stratum_df", + data_columns=unique_columns + ["average_weight"], + unique_columns=unique_columns, + constraint="sex == 'all'", + ) # ---- Use SQL table data if present if weight_fitted_sql_df is not None and not weight_fitted_sql_df.empty: # ---- Return output @@ -33,6 +26,7 @@ def get_average_strata_weights(db_file: str, else: return None + def configure_database_paths(file_configuration: dict): # Extract input directory settings @@ -42,16 +36,22 @@ def configure_database_paths(file_configuration: dict): database_dir = file_configuration["database_directory"] # Update configuration - file_configuration["database"].update({ - dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) - for dataset in file_settings.keys() if "database_name" in file_settings[dataset] - }) + file_configuration["database"].update( + { + dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) + for dataset in file_settings.keys() + if "database_name" in file_settings[dataset] + } + ) + -def acoustic_pipeline(acoustic_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): +def acoustic_pipeline( + acoustic_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = [], +): # Get spatial column spatial_column = file_configuration["spatial_column"] @@ -68,40 +68,32 @@ def acoustic_pipeline(acoustic_dict: dict, if acoustic_dict["nasc_df"] is None or acoustic_dict["nasc_df"].empty: # ---- Print, if verbose if verbose: - print( - f"No new processed acoustic data available for processing." 
- ) + print("No new processed acoustic data available for processing.") else: # Get related acoustic data - acoustic_df = get_nasc_sql_data(acoustic_db, - acoustic_dict, - unique_columns=unique_columns) - + acoustic_df = get_nasc_sql_data(acoustic_db, acoustic_dict, unique_columns=unique_columns) + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(biology_db, - acoustic_dict, - unique_columns=["stratum"]) - + sigma_bs_df = get_sigma_bs_sql_data(biology_db, acoustic_dict, unique_columns=["stratum"]) + # Calculate population estimates if valid data are available if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + nasc_biology["number_density"] = nasc_biology["nasc"] / ( + 4.0 * np.pi * nasc_biology["sigma_bs_mean"] ) # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - acoustic_dict, - unique_columns=spatial_column + contrast_columns) - + weight_spatial_averages = get_average_strata_weights( + biology_db, acoustic_dict, unique_columns=spatial_column + contrast_columns + ) + if weight_spatial_averages is not None: # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, - on=spatial_column) + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=spatial_column) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -109,49 +101,65 @@ def acoustic_pipeline(acoustic_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if (all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) - and not nasc_biology.empty): - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["id"]) - + if ( + all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) + and not nasc_biology.empty + ): + sql_group_update( + acoustic_db, + dataframe=nasc_biology, + table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["id"], + ) + # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) # Update grid - update_population_grid(file_configuration, coordinates=["x", "y"], - dataset=nasc_biology) + update_population_grid( + file_configuration, coordinates=["x", "y"], dataset=nasc_biology + ) + + +def get_nasc_sql_data(db_file: str, data_dict: dict, unique_columns: List[str]): -def get_nasc_sql_data(db_file: str, - data_dict: dict, - unique_columns: List[str]): - # Add SELECTION columns - data_columns = ( - unique_columns + ["longitude", "latitude", "ping_time", "nasc", "number_density", - "biomass_density", "id"] - ) + data_columns = unique_columns + [ + "longitude", + "latitude", + "ping_time", + "nasc", + "number_density", + "biomass_density", + "id", + ] # ----- Get the SQL dataset - nasc_sql_data = query_dataset(db_file, - data_dict, - table_name="survey_data_df", - data_columns = data_columns, - unique_columns=unique_columns, - constraint="nasc > 0.0") + nasc_sql_data = query_dataset( + db_file, + data_dict, + table_name="survey_data_df", + 
data_columns=data_columns, + unique_columns=unique_columns, + constraint="nasc > 0.0", + ) # ---- Use SQL table data if present if nasc_sql_data is not None and not nasc_sql_data.empty: return nasc_sql_data elif "nasc_df" in data_dict.keys(): return data_dict["nasc_df"] -def get_sigma_bs_sql_data(db_file: str, - data_dict: dict, - unique_columns: list): + +def get_sigma_bs_sql_data(db_file: str, data_dict: dict, unique_columns: list): # Get corresponding `sigma_bs` DataFrame - sigma_bs_sql_df = query_dataset(db_file, data_dict, table_name="sigma_bs_mean_df", - data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], - unique_columns=unique_columns) + sigma_bs_sql_df = query_dataset( + db_file, + data_dict, + table_name="sigma_bs_mean_df", + data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], + unique_columns=unique_columns, + ) # ---- Use SQL table data if present if sigma_bs_sql_df is not None and not sigma_bs_sql_df.empty: # ---- Compute the weighted average @@ -165,14 +173,15 @@ def get_sigma_bs_sql_data(db_file: str, return sigma_bs_mean_sql_df else: return None - -def biology_pipeline(biology_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): +def biology_pipeline( + biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = [], +): # Get spatial column spatial_column = file_configuration["spatial_column"] @@ -186,43 +195,36 @@ def biology_pipeline(biology_dict: dict, # Check for data completion # ---- List of boolean values - full_biology_data = ( - [True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) - else False for _, df in biology_dict.items()] - ) + full_biology_data = [ + True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) else False + for _, df in biology_dict.items() + ] # ---- Validation if not all(full_biology_data): # ---- Print, if verbose if verbose: - print( - f"No new processed biology data available for processing." 
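For reference, a hedged sketch of the NASC-to-number-density conversion used in `acoustic_pipeline` above; the stratum, NASC, and sigma_bs values are synthetic.

import numpy as np
import pandas as pd

nasc_biology = pd.DataFrame(
    {"stratum": [1, 2], "nasc": [1500.0, 800.0], "sigma_bs_mean": [2.0e-5, 1.5e-5]}
)
# Number density (animals nmi^-2) = NASC / (4 * pi * mean sigma_bs)
nasc_biology["number_density"] = nasc_biology["nasc"] / (
    4.0 * np.pi * nasc_biology["sigma_bs_mean"]
)
# First row: 1500 / (4 * pi * 2e-5) ~ 5.97e6 animals nmi^-2; the stratum mean
# weights merged in above are then used to derive the biomass density column.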
- ) + print("No new processed biology data available for processing.") else: # Get related biology data - acoustic_df = get_nasc_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) + acoustic_df = get_nasc_sql_data(acoustic_db, biology_dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(biology_db, - biology_dict, - unique_columns=unique_columns) + sigma_bs_df = get_sigma_bs_sql_data(biology_db, biology_dict, unique_columns=unique_columns) # Calculate population estimates if valid data are available - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + nasc_biology["number_density"] = nasc_biology["nasc"] / ( + 4.0 * np.pi * nasc_biology["sigma_bs_mean"] ) # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) - + weight_spatial_averages = get_average_strata_weights( + biology_db, biology_dict, unique_columns=unique_columns + ) + if weight_spatial_averages is not None: # Merge average weights with number density estimates nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) @@ -233,14 +235,19 @@ def biology_pipeline(biology_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if not nasc_biology.empty: - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) - + if not nasc_biology.empty: + sql_group_update( + acoustic_db, + dataframe=nasc_biology, + table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"], + ) + # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) # Update population grid - update_population_grid(file_configuration, coordinates=["stratum"], - dataset=nasc_biology) + update_population_grid( + file_configuration, coordinates=["stratum"], dataset=nasc_biology + ) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 2d7ac606..75b362f2 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -1,33 +1,40 @@ +from pathlib import Path +from typing import List, Union + import geopandas as gpd -import pandas as pd import numpy as np -from geopy.distance import distance -from ..spatial.projection import utm_string_generator +import pandas as pd import shapely.geometry -from shapely.geometry import box import sqlalchemy as sqla -from pathlib import Path -from typing import Union, List -from .sql_methods import sql_group_update, query_dataset +from geopy.distance import distance +from shapely.geometry import box + +from ..spatial.projection import utm_string_generator +from .sql_methods import query_dataset, sql_group_update + def create_inpfc_strata(spatial_config: dict): # Extract the INPFC definitions - inpfc_definitions = 
spatial_config["inpfc"] + inpfc_definitions = spatial_config["inpfc"] # Create latitude bins latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) # ---- Append 1 more stratum layer - bin_names = np.concatenate([inpfc_definitions["stratum_names"], - [np.max(inpfc_definitions["stratum_names"]) + 1]]) - + bin_names = np.concatenate( + [inpfc_definitions["stratum_names"], [np.max(inpfc_definitions["stratum_names"]) + 1]] + ) + # Create spatial key - inpfc_strata_df = pd.DataFrame({ - "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), - "latitude_interval": pd.cut(np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), - latitude_bins), - "stratum": bin_names, - }) + inpfc_strata_df = pd.DataFrame( + { + "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + "latitude_interval": pd.cut( + np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), latitude_bins + ), + "stratum": bin_names, + } + ) # Add boundaries # ---- Lower @@ -38,8 +45,9 @@ def create_inpfc_strata(spatial_config: dict): # Return the dataframe return inpfc_strata_df + def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): - + # Create dataset copy dataset = dataset.copy() @@ -48,35 +56,37 @@ def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): dataset.loc[:, "stratum"] = pd.cut( dataset.loc[:, "latitude"], np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), - labels = inpfc_df.loc[:, "stratum"] + labels=inpfc_df.loc[:, "stratum"], ).astype(int) - + return dataset else: - strata = pd.cut(dataset.copy(), - np.unique(np.hstack([inpfc_df.loc[:, "lower"], - inpfc_df.loc[:, "upper"]])), - labels = inpfc_df.loc[:, "stratum"] + strata = pd.cut( + dataset.copy(), + np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), + labels=inpfc_df.loc[:, "stratum"], ).astype(int) - + return strata # Return the INPFC-stratified dataset # return dataset + def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dict): # Get the acoustic-biology link method link_method = spatial_dict["link_method"] - + # Apply spatial definitions if isinstance(dataset, dict) and link_method == "INPFC": - dataset.update({ - k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items() - }) + dataset.update( + {k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items()} + ) elif isinstance(dataset, pd.Series) and link_method == "INPFC": return apply_inpfc_definitions(dataset, spatial_dict["strata"]) + # def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): # # Extract the INPFC definitions @@ -87,7 +97,7 @@ def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dic # # ---- Append 1 more stratum layer # bin_names = np.concatenate([inpfc_definitions["stratum_names"], # [np.max(inpfc_definitions["stratum_names"]) + 1]]) - + # # Create spatial key # spatial_config["spatial_key"] = pd.DataFrame({ # "latitude_limit": inpfc_definitions["latitude_max"], @@ -120,8 +130,9 @@ def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dic # labels = bin_names, # ) + def define_boundary_box(boundary_dict: dict, projection: str): - + # Get x-coordinates if "longitude" in boundary_dict.keys(): x = np.array(boundary_dict["longitude"]) @@ -135,10 +146,12 @@ def define_boundary_box(boundary_dict: dict, projection: str): y = np.array(boundary_dict["eastings"]) # 
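For reference, a minimal sketch of the latitude-binned stratum assignment behind `create_inpfc_strata`/`apply_inpfc_definitions` above, using hypothetical latitude limits rather than real INPFC boundaries.

import numpy as np
import pandas as pd

latitude_max = [40.0, 45.0, 50.0]          # hypothetical northern stratum limits
stratum_names = [1, 2, 3]
# Pad the limits with global latitude bounds and append one extra northern stratum
bins = np.concatenate([[-90.0], latitude_max, [90.0]])
labels = stratum_names + [max(stratum_names) + 1]

latitudes = pd.Series([38.5, 46.2, 55.0])
strata = pd.cut(latitudes, bins, labels=labels).astype(int)
# -> strata of 1, 3, 4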
Create a boundary DataFrame - bound_df = pd.DataFrame({ - "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), - "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), - }) + bound_df = pd.DataFrame( + { + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y": np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + } + ) # Convert to a GeoDataFrame and return the GeoDataFrame return gpd.GeoDataFrame( @@ -147,6 +160,7 @@ def define_boundary_box(boundary_dict: dict, projection: str): crs=projection, ) + def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Extract the griddification definitions @@ -161,8 +175,11 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Convert the coordinates, if needed if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): # ---- Compute the equivalent UTM string - utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), - np.median(boundary_box.loc[0:3, "y"]))) + utm_num = int( + utm_string_generator( + np.median(boundary_box.loc[0:3, "x"]), np.median(boundary_box.loc[0:3, "y"]) + ) + ) # ---- Compute the boundary box GeoDataFrame with the new projection boundary_box = boundary_box.to_crs(utm_num) # ---- Create a new projection for later @@ -184,8 +201,8 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # ---- Iterate through for y0 in np.arange(ymin, ymax, y_step): for x0 in np.arange(xmin, xmax, x_step): - x1 = x0-x_step - y1 = y0+y_step + x1 = x0 - x_step + y1 = y0 + y_step grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) # Convert to a GeoDataFrame @@ -210,23 +227,25 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Bin the longitude data dataset_gdf["stratum_x"] = pd.cut( dataset_gdf["x"], - np.arange(xmin, xmax+x_step, x_step), - right = False, - labels = np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))), + np.arange(xmin, xmax + x_step, x_step), + right=False, + labels=np.arange(1, len(np.arange(xmin, xmax + x_step, x_step))), ).astype(int) # Bin the latitude data - dataset_gdf["stratum_y"] = pd.cut( - dataset_gdf["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 + dataset_gdf["stratum_y"] = ( + pd.cut( + dataset_gdf["y"], + np.arange(ymin, ymax + y_step, y_step), + right=True, + labels=range(len(np.arange(ymin, ymax + y_step, y_step)) - 1), + ).astype(int) + + 1 + ) # Update the original dataset - return ( - dataset_gdf.loc[:, ["stratum_x", "stratum_y"]] - .rename(columns={"stratum_x": "x", "stratum_y": "y"}) + return dataset_gdf.loc[:, ["stratum_x", "stratum_y"]].rename( + columns={"stratum_x": "x", "stratum_y": "y"} ) # dataset.loc[:, "x"] = dataset_gdf.copy().loc[:, "stratum_x"] # dataset.loc[:, "y"] = dataset_gdf.copy().loc[:, "stratum_y"] @@ -244,9 +263,9 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) # # Convert the coordinates, if needed -# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): +# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): # # ---- Compute the equivalent UTM string -# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), +# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), 
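For reference, a small sketch of the box-grid construction pattern used by `apply_griddify_definitions`/`initialize_grid` above, over a hypothetical 2 x 2 nmi UTM extent at 1 nmi resolution.

import numpy as np
import geopandas as gpd
from shapely.geometry import box

xmin, ymin = 0.0, 0.0
xmax = ymax = 2 * 1852.0          # metres; 1 nmi = 1852 m
x_step = y_step = 1852.0          # 1 nmi grid resolution

grid_cells = [
    box(x0, y0, x0 + x_step, y0 + y_step)
    for y0 in np.arange(ymin, ymax, y_step)
    for x0 in np.arange(xmin, xmax, x_step)
]
cells_gdf = gpd.GeoDataFrame(geometry=grid_cells)
# cells_gdf.area / 1852 ** 2 -> 1.0 nmi^2 for each of the four cells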
# np.median(boundary_box.loc[0:3, "y"]))) # # ---- Compute the boundary box GeoDataFrame with the new projection # boundary_box = boundary_box.to_crs(utm_num) @@ -351,10 +370,11 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # # # biology_data["trawl_info_df"]["stratum"] = ( -# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) # ) -def initialize_grid(file_configuration = dict): + +def initialize_grid(file_configuration=dict): # Get root directory, if defined if "data_root_dir" in file_configuration: @@ -382,7 +402,7 @@ def initialize_grid(file_configuration = dict): # Get projection projection = file_configuration["geospatial"]["projection"] - + # Get grid settings grid_settings = file_configuration["geospatial"]["griddify"] @@ -398,22 +418,26 @@ def initialize_grid(file_configuration = dict): # ---- y y = boundary["latitude"] # ---- Create DataFrame - boundary_df = pd.DataFrame({ - "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), - "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) - }) + boundary_df = pd.DataFrame( + { + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]), + } + ) # Create GeoDataFrame boundary_gdf = gpd.GeoDataFrame( - data = boundary_df, + data=boundary_df, geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), - crs = projection + crs=projection, ) # Convert to UTM (decimal degrees to m) # ---- Create UTM code - utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, - (boundary_df.y.min() + boundary_df.y.max()) / 2) + utm_code = utm_string_generator( + (boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2, + ) # ---- Create number code utm_num = int(utm_code) # ---- UTM conversion @@ -432,7 +456,8 @@ def initialize_grid(file_configuration = dict): grid_cells = [] # ---- Initialize coordinate counter y_ct = 0 - x_coord = []; y_coord = [] + x_coord = [] + y_coord = [] # ---- Iterate through to generate cells for y0 in np.arange(ymin, ymax, y_step): y_ct += 1 @@ -449,9 +474,9 @@ def initialize_grid(file_configuration = dict): # Convert to a GeoDataFrame cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) - # ---- Add cordinates + # ---- Add coordinates cells_gdf.loc[:, "x"] = np.array(x_coord) - cells_gdf.loc[:, "y"] = np.array(y_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) # Get coastline shapefile directory, if defined if "coastline" in file_configuration["input_directories"]: @@ -459,14 +484,14 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] # ---- Get root folder directory - # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] - coast_root = ( - "/".join([root_dir, coast_settings["directory"], coast_settings["coastline_name"]]) + # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + coast_root = "/".join( + [root_dir, coast_settings["directory"], coast_settings["coastline_name"]] ) # ---- Create filepath shp_filepath = ( - # root_dir / coast_settings["directory"] - # / coast_settings["coastline_name"] + # root_dir / coast_settings["directory"] + # / coast_settings["coastline_name"] # coast_root # 
/ f"{coast_settings['coastline_name']}.shp" "/".join([coast_root, f"{coast_settings['coastline_name']}.shp"]) @@ -479,63 +504,67 @@ def initialize_grid(file_configuration = dict): # Get original lat/lon geometry boundaries xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds - + # Read in file - full_coast = gpd.read_file(shp_filepath, - engine="pyogrio", - storage_options=file_configuration["storage_options"]) + full_coast = gpd.read_file( + shp_filepath, + engine="pyogrio", + storage_options=file_configuration["storage_options"], + ) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] - # Create bouning box with a buffer + # Create bounding box with a buffer boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) # ---- Create an unbuffered copy boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) # ---- Convert to a GeoDataFrame - boundary_box_unbuffered_gdf = ( - gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + boundary_box_unbuffered_gdf = gpd.GeoDataFrame( + geometry=[boundary_box_unbuffered], crs=projection ) # ---- Clip the coastline for saving - clipped_coast_original = ( - gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + clipped_coast_original = gpd.clip( + full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1) ) # Clip the coastline shapefile clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) # Clip the grid cells - cells_gdf.loc[:, "geometry"] = ( - cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + cells_gdf.loc[:, "geometry"] = cells_gdf["geometry"].difference( + clipped_coast.geometry.union_all() ) # Calculate area per cell cells_gdf.loc[:, "area"] = cells_gdf.area # ---- Convert back to nmi^2 from m^2 - cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852 ** 2 + cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852**2 - # Convert back to original projection and clip - clipped_cells_latlon = ( - gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) - .reset_index(drop=True) - ) + # Convert back to original projection and clip + clipped_cells_latlon = gpd.clip( + cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf + ).reset_index(drop=True) # Initialize empty columns that can be added to later on - clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", - "abundance", "biomass"]] = 0.0 - + clipped_cells_latlon.loc[ + :, ["number_density_mean", "biomass_density_mean", "abundance", "biomass"] + ] = 0.0 + # Create output DataFrame - output_df = pd.DataFrame({ - "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) - }) + output_df = pd.DataFrame( + {"geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt)} + ) # ---- Add the required columns - output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], - axis=1) + output_df = pd.concat( + [output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], axis=1 + ) # ---- Initialize empty columns that can be added to later on - output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", - "biomass"]] = 0.0 - + output_df.loc[ + :, ["number_density_mean", "biomass_density_mean", "abundance", "biomass"] + ] = 0.0 + # Write to the database file (for the grid) # ---- Create engine engine = sqla.create_engine(f"sqlite:///{db_filepath}") @@ -544,33 +573,36 @@ def initialize_grid(file_configuration = dict): # Write to the 
database file (for the coastline shapefile) # ---- Create output copy - coastline_out = pd.DataFrame({ - "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) - }) + coastline_out = pd.DataFrame( + {"geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt)} + ) # ---- Concatenate - coastline_out = ( - pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + coastline_out = pd.concat( + [coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1 ) # ---- Connect and create table _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) -def update_population_grid(file_configuration: dict, - coordinates: Union[List[str], str], - dataset: Union[dict, pd.DataFrame]): + +def update_population_grid( + file_configuration: dict, coordinates: Union[List[str], str], dataset: Union[dict, pd.DataFrame] +): # Extract input directory settings file_settings = file_configuration["input_directories"] # Get filepath for grid grid_db = list( - Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings['grid']['database_name']}") + Path(file_configuration["database_directory"]).glob( + pattern=f"{file_settings['grid']['database_name']}" + ) )[0] # Get filepath for acoustics survey_db = list( - Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings['acoustics']['database_name']}") + Path(file_configuration["database_directory"]).glob( + pattern=f"{file_settings['acoustics']['database_name']}" + ) )[0] # Define the SQL tables that will be parsed and queries @@ -578,29 +610,41 @@ def update_population_grid(file_configuration: dict, grid_table = "grid_df" # Get indexed survey data - indexed_data = query_dataset(survey_db, - dataset, - table_name=data_table, - data_columns=coordinates + ["x", "y", "number_density", - "biomass_density"], - unique_columns=coordinates) - + indexed_data = query_dataset( + survey_db, + dataset, + table_name=data_table, + data_columns=coordinates + ["x", "y", "number_density", "biomass_density"], + unique_columns=coordinates, + ) + # Get indexed grid data - indexed_grid = query_dataset(grid_db, - indexed_data, - table_name=grid_table, - data_columns= ["x", "y", "area", "number_density_mean", - "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) - + indexed_grid = query_dataset( + grid_db, + indexed_data, + table_name=grid_table, + data_columns=[ + "x", + "y", + "area", + "number_density_mean", + "biomass_density_mean", + "abundance", + "biomass", + ], + unique_columns=["x", "y"], + ) + # Set DataFrame index indexed_grid.set_index(["x", "y"], inplace=True) - # Update the areal density esitmates + # Update the areal density estimates # ---- Number (animals/nmi^2) indexed_grid["number_density_mean"] = indexed_data.groupby(["x", "y"])["number_density"].mean() # ---- Bioamss (kg/nmi^2) - indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])["biomass_density"].mean() + indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])[ + "biomass_density" + ].mean() # Compute the abundance and biomass per grid cell # ---- Abundance (# animals) @@ -612,12 +656,10 @@ def update_population_grid(file_configuration: dict, # ---- Reset index output_df = indexed_grid.reset_index() # ---- Grouped update - sql_group_update(grid_db, dataframe=output_df, table_name=grid_table, - columns=["number_density_mean", "biomass_density_mean", "abundance", - "biomass"], - unique_columns=["x", "y"]) - - - - - \ No 
newline at end of file + sql_group_update( + grid_db, + dataframe=output_df, + table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"], + ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 51cc4ba8..297d5be2 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,28 +1,12 @@ -from typing import Union, Optional, Literal -from pathlib import Path -from datetime import datetime import copy -import pandas as pd - -from .sql_methods import query_processed_files - -from .live_core import( - LIVE_DATA_STRUCTURE, -) +from datetime import datetime +from pathlib import Path +from typing import Literal, Optional, Union -from ..acoustics import ( - ts_length_regression, - to_dB, - to_linear -) +import pandas as pd -from .sql_methods import query_processed_files -from .live_acoustics import ( - compute_nasc, - format_acoustic_dataset, - preprocess_acoustic_data -) - +from . import live_data_loading as eldl, live_data_processing as eldp +from .live_acoustics import compute_nasc, format_acoustic_dataset, preprocess_acoustic_data from .live_biology import ( bin_length_data, compute_average_weights, @@ -30,25 +14,24 @@ length_bin_counts, length_bin_weights, length_weight_regression, - number_proportions, + number_proportions, preprocess_biology_data, - weight_proportions + weight_proportions, ) - +from .live_core import LIVE_DATA_STRUCTURE from .live_spatial_methods import initialize_grid +from .sql_methods import query_processed_files -from . import live_data_processing as eldp -from . import live_data_loading as eldl class LiveSurvey: """ - A real-time processing version of the `echopop` base `Survey` class that ingests biological, + A real-time processing version of the `echopop` base `Survey` class that ingests biological, acoustic, and event meta data to provide population estimates when generated. 
""" def __init__( self, - live_init_config_path: Union[str, Path], + live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], cloud_storage_options: dict = {}, verbose: bool = True, @@ -66,10 +49,8 @@ def __init__( {"database": {key: None for key in self.config["input_directories"].keys()}} ) # ---- Add cloud storage options, if needed - self.config.update( - {"storage_options": cloud_storage_options} - ) - + self.config.update({"storage_options": cloud_storage_options}) + # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) @@ -88,8 +69,8 @@ def __init__( # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) - # TODO: Add verbosity for printing database filepaths/connections - if verbose: + # TODO: Add verbosity for printing database filepaths/connections + if verbose: pass def __repr__(self): @@ -100,7 +81,9 @@ def __repr__(self): acoustic_filenames = self.meta["provenance"]["acoustic_files_read"] # ---- Subset if many files are being processed if len(acoustic_filenames) > 2: - acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + acoustic_filenames = ( + acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + ) # ---- Format string acoustic_files = ", ".join(acoustic_filenames) else: @@ -119,8 +102,8 @@ def __repr__(self): biology_files = "None" # Get linked database names - linked_dbs = ( - "\n ".join([f"{key.title()}: {db}" for key, db in self.config["database"].items()]) + linked_dbs = "\n ".join( + [f"{key.title()}: {db}" for key, db in self.config["database"].items()] ) return ( @@ -130,96 +113,96 @@ def __repr__(self): f"Biology files being processed: \n {biology_files}\n" f"Linked databases: \n {linked_dbs}" ) - + def __str__(self): return self.__repr__() - def load_acoustic_data(self, - xarray_kwargs: dict = {}, - input_filenames: Optional[list] = None, - verbose: bool = True): - + def load_acoustic_data( + self, xarray_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True + ): + # Validate the data directory and format the filepaths - acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames) - + acoustic_files = eldl.validate_data_directory( + self.config, dataset="acoustics", input_filenames=input_filenames + ) + # Read in the acoustic data files if acoustic_files: # ! 
[REQUIRES DASK] ---- Read in the listed file # ---- Read in the acoustic data files - prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files, - xarray_kwargs=xarray_kwargs) + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files( + acoustic_files, xarray_kwargs=xarray_kwargs + ) # ---- Add the `acoustic_data_units` to the dictionary - self.config["acoustics"]["dataset_units"] = acoustic_data_units + self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset # TODO: SettingWithCopyWarning: - self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), - self.input["spatial"], - self.config) + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data( + prc_nasc_df.copy(), self.input["spatial"], self.config + ) # ---- Add meta key - self.meta["provenance"].update({ - "acoustic_files_read": acoustic_files, - }) - # TODO: Add verbosity for printing database filepaths/connections + self.meta["provenance"].update( + { + "acoustic_files_read": acoustic_files, + } + ) + # TODO: Add verbosity for printing database filepaths/connections if verbose: # ---- Create file list file_list = "\n".join(acoustic_files) - print( - f"The following acoustic files are being processed:\n" - f"{file_list}." - ) + print(f"The following acoustic files are being processed:\n" f"{file_list}.") else: self.input["acoustics"]["prc_nasc_df"] = None - def load_biology_data(self, - pandas_kwargs: dict = {}, - input_filenames: Optional[list] = None, - verbose: bool = True): + def load_biology_data( + self, pandas_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True + ): # Validate the data directory and format the filepaths - biology_files = eldl.validate_data_directory(self.config, dataset="biology", - input_filenames=input_filenames) - - # ! REMOVE + biology_files = eldl.validate_data_directory( + self.config, dataset="biology", input_filenames=input_filenames + ) + + # ! REMOVE self.meta["provenance"]["biology_files_checkpoint1"] = biology_files - - # TODO: Add verbosity for printing database filepaths/connections + + # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: # ---- Create file list file_list = "\n".join(biology_files) - print( - f"The following biological files are being processed:\n" - f"{file_list}." - ) - + print(f"The following biological files are being processed:\n" f"{file_list}.") + # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config, - pandas_kwargs=pandas_kwargs) - - # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint2"] =( - {key: df.shape for key, df in initial_biology_output.items()} - ) + initial_biology_output = eldl.read_biology_files( + biology_files, self.config, pandas_kwargs=pandas_kwargs + ) + + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] = { + key: df.shape for key, df in initial_biology_output.items() + } # Preprocess the biology dataset - self.input["biology"], self.input["biology_processed"] = ( - preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + self.input["biology"], self.input["biology_processed"] = preprocess_biology_data( + initial_biology_output, self.input["spatial"], self.config ) - # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint3"] = ( - {key: df.shape for key, df in self.input["biology_processed"].items()} - ) + # ! 
REMOVE + self.meta["provenance"]["biology_files_checkpoint3"] = { + key: df.shape for key, df in self.input["biology_processed"].items() + } # Add meta key - self.meta["provenance"].update({ - "biology_files_read": biology_files, - }) + self.meta["provenance"].update( + { + "biology_files_read": biology_files, + } + ) def process_biology_data(self): # TODO: How and when should the already processed data be imported? - # Separate out processed and unprocessed biological data + # Separate out processed and unprocessed biological data # ----- Unprocessed biology_unprocessed = self.input["biology"] @@ -227,77 +210,82 @@ def process_biology_data(self): root_directory = self.config["database_directory"] # Check if data are present - unprocess_data_dfs = ( - [True if isinstance(df, pd.DataFrame) and not df.empty else False - for _, df in biology_unprocessed.items()] - ) + unprocess_data_dfs = [ + True if isinstance(df, pd.DataFrame) and not df.empty else False + for _, df in biology_unprocessed.items() + ] # ---- Proceed in processing the unprocessed data if all(unprocess_data_dfs): # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + compute_sigma_bs( + biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], self.config + ) # Bin the length measurements of the biological data bin_length_data(biology_unprocessed, self.config["length_distribution"]) # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - + length_weight_df = length_weight_regression( + biology_unprocessed["specimen_df"], self.config["length_distribution"], self.config + ) + # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - self.config) + specimen_binned, specimen_binned_filtered, length_binned = length_bin_counts( + biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], self.config ) # Compute the number proportions specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, - length_binned, self.config) + number_proportions( + specimen_binned, specimen_binned_filtered, length_binned, self.config + ) ) # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) + length_weight_binned, specimen_weight_binned = length_bin_weights( + biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df, + self.config, ) # Calculate the average weights among male, female, and all fish - self.input["weight_stratum_df"] = ( - compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) + self.input["weight_stratum_df"] = compute_average_weights( + specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config, ) - + # Compute the weight 
proportions - self.input["biology"].update({ - "proportions": weight_proportions(biology_unprocessed["catch_df"], - specimen_weight_binned, - length_weight_binned, - length_number_proportion, - length_weight_df, - self.config) - }) + self.input["biology"].update( + { + "proportions": weight_proportions( + biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config, + ) + } + ) # Update the database - query_processed_files(root_directory, - self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files_read"], - processed=True) - + query_processed_files( + root_directory, + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files_read"], + processed=True, + ) + # Add meta key - self.meta["provenance"].update({ - "biology_files_processed": self.meta["provenance"]["biology_files_read"] - }) - + self.meta["provenance"].update( + {"biology_files_processed": self.meta["provenance"]["biology_files_read"]} + ) def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -311,46 +299,45 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): "No acoustic data located in `*.input['acoustics']['prc_nasc_df']" " DataFrame. Data processing step will therefore be skipped." ) - else: + else: # Get the unprocessed acoustic data acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) - + # Format the dataframe and insert into the LiveSurvey object - self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, - self.config, - self.meta) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset( + nasc_data_df, self.config, self.meta + ) # Add meta key - self.meta["provenance"].update({ - "acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"] - }) - - def estimate_population(self, - working_dataset: Literal["acoustic", "biology"], - verbose: bool = True): - + self.meta["provenance"].update( + {"acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"]} + ) + + def estimate_population( + self, working_dataset: Literal["acoustic", "biology"], verbose: bool = True + ): + self.meta["provenance"][f"{working_dataset}_population"] = False # method if working_dataset == "acoustic": - eldp.acoustic_pipeline(self.input["acoustics"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose, - contrast_columns=["ship_id"]) + eldp.acoustic_pipeline( + self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose, + contrast_columns=["ship_id"], + ) # --- Validate successful run self.meta["provenance"]["acoustic_population"] = True - + # method if working_dataset == "biology": - eldp.biology_pipeline(self.input["biology"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose) + eldp.biology_pipeline( + self.input["biology"], self.input["spatial"]["strata"], self.config, verbose=verbose + ) # --- Validate successful run self.meta["provenance"]["biology_population"] = True - - diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 4c59d975..a1d55a26 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -1,17 +1,21 @@ -from echopop.live.sql_methods import SQL -from shapely import wkt +from pathlib import Path +from typing import 
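For reference, a hedged end-to-end usage sketch of the `LiveSurvey` workflow defined above; the two YAML paths are placeholders, the import path is inferred from the file layout in this patch, and the call order simply follows the methods shown here.

from echopop.live.live_survey import LiveSurvey

survey = LiveSurvey(
    live_init_config_path="path/to/live_init_config.yml",     # placeholder path
    live_file_config_path="path/to/live_survey_config.yml",   # placeholder path
    verbose=True,
)

# Ingest any new files, process them, and update the population estimates
survey.load_acoustic_data()
survey.load_biology_data()
survey.process_biology_data()
survey.process_acoustic_data(echometrics=True)
survey.estimate_population(working_dataset="biology")
survey.estimate_population(working_dataset="acoustic")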
Optional, Union + +import geopandas as gpd import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap import numpy as np import pandas as pd -import geopandas as gpd -from typing import Union, Optional -from pathlib import Path -import matplotlib.gridspec as gridspec +from matplotlib.colors import ListedColormap +from shapely import wkt + +from echopop.live.sql_methods import SQL -def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], - projection: str, - coast_db: Optional[Union[Path, pd.DataFrame]] = None): + +def plot_livesurvey_grid( + grid_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None, +): # Extract grid data from database if needed if isinstance(grid_db, Path): @@ -23,31 +27,31 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], ) else: grid_data = grid_db - + # Extract coast data from database if needed if isinstance(coast_db, Path): # ---- SELECT - coast_data = SQL(coast_db, "select", table_name="coastline_df") + coast_data = SQL(coast_db, "select", table_name="coastline_df") elif coast_data is None: # ---- SELECT from `grid_data` - coast_data = SQL(grid_db, "select", table_name="coastline_df") + coast_data = SQL(grid_db, "select", table_name="coastline_df") elif not isinstance(coast_db, pd.DataFrame): raise TypeError( "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object, " "or exist within the SQL database as a table (`'coastline_df'`) within the `grid_data` " "input (i.e. `grid_data.db`)." - ) + ) else: - coast_data = coast_db - + coast_data = coast_db + # Format columns if needed (well-known-text to Polygon) # ---- `grid_data` if isinstance(grid_data["geometry"][0], str): grid_data["geometry"] = grid_data["geometry"].apply(wkt.loads) # ---- `coastline_data` if isinstance(coast_data["geometry"][0], str): - coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) - + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + # Generate GeoDataFrames # ---- `grid` grid_gdf = gpd.GeoDataFrame(grid_data, geometry="geometry", crs=projection) @@ -63,27 +67,21 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", "colormap": "viridis", - "color_threshold": { - "minimum": 1e1, - "maximum": 1e6 - }, - }, + "color_threshold": {"minimum": 1e1, "maximum": 1e6}, + }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", "colormap": "plasma", - "color_threshold": { - "minimum": 1e1, - "maximum": 1e6 - }, - }, + "color_threshold": {"minimum": 1e1, "maximum": 1e6}, + }, "biomass": { "name": "Biomass", "units": "kg", "colormap": "cividis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), - "maximum": 1e6 * grid_gdf["area"].max() + "maximum": 1e6 * grid_gdf["area"].max(), }, }, "abundance": { @@ -92,9 +90,9 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "colormap": "inferno", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), - "maximum": 1e6 * grid_gdf["area"].max() + "maximum": 1e6 * grid_gdf["area"].max(), }, - } + }, } # Create a figure and a 2x2 grid of subplots @@ -108,7 +106,7 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap (np.linspace(0, 1, 256))[::-1] + newcolors = colormap(np.linspace(0, 1, 256))[::-1] # ---- Define `white` 
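        # NOTE: plain RGBA white; the reversed colormap's "start" color is swapped for this value
        # just below, presumably so cells at the low end of the color scale fade into the white
        # background instead of taking the colormap's end color.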
white = np.array([1, 1, 1, 1]) # ---- Replace "start" color @@ -124,29 +122,35 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], min_value = sub_grid_gdf[var].min() max_value = sub_grid_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=min_value, vmax=max_value) + norm = plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the polygons with color fills based on the variable (non-zero) - grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, - norm=norm, - markersize=0, linewidth=0.5) + grid_gdf.plot( + column=var, + ax=ax, + edgecolor="gainsboro", + legend=False, + cmap=custom_cmap, + norm=norm, + markersize=0, + linewidth=0.5, + ) # ---- Add coastline data layer - coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + coast_gdf.plot(ax=ax, linewidth=1.2, color="gray", edgecolor="black") # ---- Set axis limits - ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*1.01) - ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + ax.set_xlim(axis_limits[0] * 1.005, axis_limits[2] * 1.01) + ax.set_ylim(axis_limits[1] * 0.98, axis_limits[3] * 1.005) # ---- Trim down the margins - ax.margins(0,0) + ax.margins(0, 0) # ---- Set adjustable aspect ratio # ax.set_aspect('equal', adjustable='box') # ---- Set the title and labels var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - ax.set_xlabel(u'Longitude (\u00B0E)') - ax.set_ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel("Longitude (\u00B0E)") + ax.set_ylabel("Latitude (\u00B0N)") # ---- Add colorbar - sm = plt.cm.ScalarMappable(cmap=custom_cmap, - norm=norm) + sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable cbar = fig.colorbar(sm, ax=ax, shrink=0.5) cbar.set_label(f"{var_info['units']}") @@ -161,17 +165,27 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], x_scale = (x1 - x0) * 0.1 y_scale = (y1 - y0) * 0.1 # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 - # ---- Plot scalebar - # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], # [scalebar_y, scalebar_y], color='black', lw=2) - ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], - [y0 + y_scale, y0 + y_scale], color='black', lw=2) + ax.plot( + [x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], + color="black", + lw=2, + ) # ---- Add scale text - ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, - f'{scalebar_length} km', ha='center', va='top', color='black') + ax.text( + x0 + x_scale + scalebar_length_in_degrees / 2, + y0 + y_scale - (y1 - y0) * 0.025, + f"{scalebar_length} km", + ha="center", + va="top", + color="black", + ) - # ax.text(scalebar_x + (scalebar_length / 200), - # scalebar_y - scalebar_y_offset, + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, # f'{scalebar_length} km', ha='center', va='bottom', color='black') # Adjust layout @@ -181,9 +195,12 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], # plt.show() return fig -def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], - projection: str, - coast_db: Optional[Union[Path, pd.DataFrame]] = None): + +def plot_livesurvey_track( + survey_data_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None, +): # Extract grid data 
from database if needed if isinstance(survey_data_db, Path): @@ -195,29 +212,30 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], ) else: survey_data = survey_data_db - + # Extract coast data from database if needed if isinstance(coast_db, Path): # ---- SELECT - coast_data = SQL(coast_db, "select", table_name="coastline_df") + coast_data = SQL(coast_db, "select", table_name="coastline_df") elif not isinstance(coast_db, pd.DataFrame): raise TypeError( "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object." - ) + ) else: coast_data = coast_db - + # Format columns if needed (well-known-text to Polygon) # ---- `coastline_data` if isinstance(coast_data["geometry"][0], str): - coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) - + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + # Generate GeoDataFrames # ---- `grid` - survey_gdf = gpd.GeoDataFrame(survey_data, - geometry=gpd.points_from_xy(survey_data["longitude"], - survey_data["latitude"]), - crs=projection) + survey_gdf = gpd.GeoDataFrame( + survey_data, + geometry=gpd.points_from_xy(survey_data["longitude"], survey_data["latitude"]), + crs=projection, + ) # ---- `coast` coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) @@ -232,12 +250,12 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, - "color_threshold": { + "color_threshold": { "minimum": 1e1, "maximum": 1e6, }, - "size": [25, 150] - }, + "size": [25, 150], + }, "biomass_density": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", @@ -248,19 +266,16 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "minimum": 1e1, "maximum": 1e6, }, - "size": [25, 150] - }, + "size": [25, 150], + }, "nasc": { "name": "Nautical area scattering coefficient", "units": "$\\mathregular{m^{2}~nmi^{-2}}$", "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "color_threshold": { - "minimum": 1e2, - "maximum": 1e4 - }, - "size": [25, 150] + "color_threshold": {"minimum": 1e2, "maximum": 1e4}, + "size": [25, 150], }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", @@ -268,11 +283,8 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": -999, "cbar_reverse": True, - "color_threshold": { - "minimum": -80.0, - "maximum": -36.0 - }, - "size": [5, 100] + "color_threshold": {"minimum": -80.0, "maximum": -36.0}, + "size": [5, 100], }, # "mean_Sv": { # "name": "$Mean \\mathregular{S_V}$", @@ -300,15 +312,12 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): sizes.loc[sizes < min_value] = min_value sizes.loc[sizes > max_value] = max_value - return ( - ((sizes - min_value) / (max_value - min_value)) - * (max_size - min_size) + min_size - ) - + return ((sizes - min_value) / (max_value - min_value)) * (max_size - min_size) + min_size + # Define colors for ship_ids (you can customize these colors as needed) ship_id_colors = { ship_id: plt.cm.tab10(i) # Use a colormap for distinct colors; adjust as needed - for i, ship_id in enumerate(survey_gdf['ship_id'].unique()) + for i, ship_id in enumerate(survey_gdf["ship_id"].unique()) } # Create a figure and a 2xn grid of subplots @@ -331,17 +340,24 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): custom_cmap = ListedColormap(newcolors) # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") 
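    # NOTE: the cruisetrack is drawn per vessel below: the survey points are grouped by
    # `ship_id` and each group is plotted as its own line using the color assigned in
    # `ship_id_colors`, with the line handles collected for the shared legend.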
- # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", # linewidth=0.25, linestyle="-") handles = [] # List to store legend handles for ship_id, group in survey_gdf.groupby("ship_id"): # Sort the group by latitude or longitude - # group = group.sort_values(by=["latitude", "longitude"]) - color = ship_id_colors.get(ship_id, 'gray') - line_handle, = ax.plot(group.geometry.x, group.geometry.y, color=color, - linewidth=0.25, linestyle="-", label=ship_id, zorder=1) + # group = group.sort_values(by=["latitude", "longitude"]) + color = ship_id_colors.get(ship_id, "gray") + (line_handle,) = ax.plot( + group.geometry.x, + group.geometry.y, + color=color, + linewidth=0.25, + linestyle="-", + label=ship_id, + zorder=1, + ) handles.append(line_handle) # Add handle to legend - # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, + # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, # linestyle="-", zorder=1) # ---- Drop "empty" values sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] @@ -353,38 +369,40 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): min_value = sub_gdf[var].min() max_value = sub_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=min_value, vmax=max_value) + norm = plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the points with color fills based on the variable (non-zero) ax.scatter( [geom.x for geom in sub_gdf.geometry], [geom.y for geom in sub_gdf.geometry], c=sub_gdf[var], - s=scale_sizes(values=sub_gdf[var], - min_value=min_value, - max_value=max_value, - min_size=VARIABLE_MAP[var]["size"][0], - max_size=VARIABLE_MAP[var]["size"][1]), + s=scale_sizes( + values=sub_gdf[var], + min_value=min_value, + max_value=max_value, + min_size=VARIABLE_MAP[var]["size"][0], + max_size=VARIABLE_MAP[var]["size"][1], + ), cmap=custom_cmap, norm=norm, - zorder = 2 + zorder=2, # edgecolor="black", # linewidths=0.1 - ) + ) # ---- Add coastline data layer - coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + coast_gdf.plot(ax=ax, linewidth=1.2, color="gray", edgecolor="black") # ---- Set axis limits - ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*0.995) - ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + ax.set_xlim(axis_limits[0] * 1.005, axis_limits[2] * 0.995) + ax.set_ylim(axis_limits[1] * 0.98, axis_limits[3] * 1.005) # ---- Trim down the margins - ax.margins(0,0) + ax.margins(0, 0) # ---- Set adjustable aspect ratio # ax.set_aspect('equal', adjustable='box') # ---- Set the title and labels var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - ax.set_xlabel(u'Longitude (\u00B0E)') - ax.set_ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel("Longitude (\u00B0E)") + ax.set_ylabel("Latitude (\u00B0N)") # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable @@ -401,18 +419,28 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): x_scale = (x1 - x0) * 0.1 y_scale = (y1 - y0) * 0.1 # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 - # ---- Plot scalebar - # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], # [scalebar_y, scalebar_y], color='black', lw=2) - ax.plot([x0 + x_scale, x0 + x_scale + 
scalebar_length_in_degrees], - [y0 + y_scale, y0 + y_scale], color='black', lw=2) + ax.plot( + [x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], + color="black", + lw=2, + ) # ---- Add scale text - ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, - f'{scalebar_length} km', ha='center', va='top', color='black') + ax.text( + x0 + x_scale + scalebar_length_in_degrees / 2, + y0 + y_scale - (y1 - y0) * 0.025, + f"{scalebar_length} km", + ha="center", + va="top", + color="black", + ) # ax.legend(handles=handles, title='Ship ID') - # ax.text(scalebar_x + (scalebar_length / 200), - # scalebar_y - scalebar_y_offset, + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, # f'{scalebar_length} km', ha='center', va='bottom', color='black') # Adjust layout @@ -422,137 +450,179 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # plt.show() return fig -def plot_livesurvey_distributions(weight_table: pd.DataFrame, - stratum_table: pd.DataFrame, - specimen_table: pd.DataFrame, - length_table: pd.DataFrame, - biology_db: Optional[Path] = None): - + +def plot_livesurvey_distributions( + weight_table: pd.DataFrame, + stratum_table: pd.DataFrame, + specimen_table: pd.DataFrame, + length_table: pd.DataFrame, + biology_db: Optional[Path] = None, +): + # If calling from SQL database - if biology_db is not None: + if biology_db is not None: weight_table = SQL(biology_db, "select", table_name="length_weight_df") stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") - elif not all([isinstance(df, pd.DataFrame) for df in [weight_table, stratum_table, - specimen_table, length_table]]): - raise TypeError( - "All tables must be a `pandas.DataFrame." 
- ) - + elif not all( + [ + isinstance(df, pd.DataFrame) + for df in [weight_table, stratum_table, specimen_table, length_table] + ] + ): + raise TypeError("All tables must be a `pandas.DataFrame.") + # Organize the weight table data # ---- Sum weights by stratum, sex, and length_bin aggregated_data = ( - weight_table.groupby(['stratum', 'sex', 'length_bin'])['weight'].sum().reset_index() + weight_table.groupby(["stratum", "sex", "length_bin"])["weight"].sum().reset_index() ) # ---- Create a column to indicate 'all' sexes aggregated_data_all = ( - aggregated_data.groupby(['stratum', 'length_bin'])['weight'].sum().reset_index() + aggregated_data.groupby(["stratum", "length_bin"])["weight"].sum().reset_index() ) - aggregated_data_all['sex'] = 'all' + aggregated_data_all["sex"] = "all" # ---- Combine the male, female, and all data plot_weight_data = pd.concat([aggregated_data, aggregated_data_all], ignore_index=True) - + # Define the sexes sexes = plot_weight_data.sex.unique().tolist() - + # Organize the length table data bins = plot_weight_data.length_bin.unique() + 1 full_bins = np.concatenate([[bins[0] - np.diff(bins).mean() / 2], bins]) - length_table["length_bin"] = ( - pd.cut(length_table["length"], bins=full_bins, labels=bins - 1).astype(float) - ) - length_table_sex = ( - length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() - ) + length_table["length_bin"] = pd.cut( + length_table["length"], bins=full_bins, labels=bins - 1 + ).astype(float) + # length_table_sex = ( + # length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() + # ) length_table_all = ( length_table.groupby(["stratum", "length_bin"])["length_count"].sum().reset_index() ) - length_table_all['sex'] = 'all' + length_table_all["sex"] = "all" full_count = ( - specimen_table.meld(length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"]) + specimen_table.meld( + length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"] + ) .loc[lambda x: x.sex.isin(sexes)] - .groupby(['stratum', 'sex', 'length_bin'])['length_count'].sum().reset_index() + .groupby(["stratum", "sex", "length_bin"])["length_count"] + .sum() + .reset_index() ) full_count["total"] = full_count.groupby(["stratum", "sex"])["length_count"].transform("sum") full_count["number_proportion"] = full_count["length_count"] / full_count["total"] # ---- Combine into the full dataset for plotting plot_count_data = ( - plot_weight_data - .merge(full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), - on=["stratum", "sex", "length_bin"], how="left") + plot_weight_data.merge( + full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), + on=["stratum", "sex", "length_bin"], + how="left", + ) ).fillna(0.0) - + # Get a color map - colors = plt.colormaps['tab10'] - num_strata = len(stratum_table['stratum'].unique()) + colors = plt.colormaps["tab10"] + num_strata = len(stratum_table["stratum"].unique()) num_sexes = len(sexes) - color_map = colors(num_strata) - + # color_map = colors(num_strata) + # Plot fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(6, 8), sharex=True, sharey=True) plt.subplots_adjust(hspace=0.08, wspace=0.05, bottom=0.25) # Adjust spacing between plots - + # Plot weights and counts for i, sex in enumerate(sexes): # Weight plot (left column) ax_weight = axes[i, 0] - data_weight = plot_weight_data[plot_weight_data['sex'] == sex] - for j, (stratum, group) in enumerate(data_weight.groupby('stratum')): - # color = colors(i / 
num_strata) if num_strata > 1 else colors(0) + data_weight = plot_weight_data[plot_weight_data["sex"] == sex] + for j, (stratum, group) in enumerate(data_weight.groupby("stratum")): + # color = colors(i / num_strata) if num_strata > 1 else colors(0) color = colors(j / num_strata) if num_strata > 1 else colors(0) total = group["weight"].sum() group["proportions"] = group["weight"] / total if total > 0.0 else 0.0 ms = 5 if group["proportions"].max() > 0.0 else 0.1 - # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', # label=f'Stratum {stratum}', color=color, ms=ms) - ax_weight.plot(group['length_bin'], group['proportions'], marker='o', - label=f'Stratum {stratum}', color=color, ms=ms) + ax_weight.plot( + group["length_bin"], + group["proportions"], + marker="o", + label=f"Stratum {stratum}", + color=color, + ms=ms, + ) if i == 0: - ax_weight.set_title(f'Weight') + ax_weight.set_title("Weight") if i < num_sexes - 1: # No x-ticks for non-bottom plots - ax_weight.set_xlabel('') + ax_weight.set_xlabel("") if i == num_sexes // 2: - ax_weight.set_ylabel('Within-stratum proportion [0, 1]') + ax_weight.set_ylabel("Within-stratum proportion [0, 1]") if i == num_sexes - 1: # Bottom plot - ax_weight.set_xlabel('Length bin (cm)') + ax_weight.set_xlabel("Length bin (cm)") ax_weight.set_ylim(0.0, 1.0) # Add label in the top-left corner - ax_weight.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), - transform=ax_weight.transAxes, - fontsize=12, verticalalignment='top', - bbox=dict(facecolor='white', alpha=0.8, - edgecolor='none')) - + ax_weight.text( + 0.05, + 1.00 - 0.05 * (num_sexes - 1), + sex.title(), + transform=ax_weight.transAxes, + fontsize=12, + verticalalignment="top", + bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"), + ) + # Count plot (right column) ax_count = axes[i, 1] - data_count = plot_count_data[plot_count_data['sex'] == sex] - for j, (stratum, group) in enumerate(data_count.groupby('stratum')): + data_count = plot_count_data[plot_count_data["sex"] == sex] + for j, (stratum, group) in enumerate(data_count.groupby("stratum")): color = colors(j / num_strata) if num_strata > 1 else colors(0) ms = 5 if group["number_proportion"].max() > 0.0 else 0.1 - ax_count.plot(group['length_bin'], group['number_proportion'], - marker='o', label=f'Stratum {stratum}', color=color, ms=ms) + ax_count.plot( + group["length_bin"], + group["number_proportion"], + marker="o", + label=f"Stratum {stratum}", + color=color, + ms=ms, + ) if i == 0: - ax_count.set_title(f"Number") + ax_count.set_title("Number") if i < num_sexes - 1: # No x-ticks for non-bottom plots - ax_count.set_xlabel('') + ax_count.set_xlabel("") if i == num_sexes - 1: # Bottom plot - ax_count.set_xlabel('Length bin (cm)') + ax_count.set_xlabel("Length bin (cm)") ax_count.set_ylim(0.0, 1.0) # Add label in the top-left corner - ax_count.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), - transform=ax_count.transAxes, - fontsize=12, verticalalignment='top', - bbox=dict(facecolor='white', alpha=0.8, - edgecolor='none')) + ax_count.text( + 0.05, + 1.00 - 0.05 * (num_sexes - 1), + sex.title(), + transform=ax_count.transAxes, + fontsize=12, + verticalalignment="top", + bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"), + ) # Create a new axes for the legend - legend_ax = fig.add_axes([0.15, 0.05, 0.7, 0.1]) # Position the legend axes (left, bottom, width, height) - legend_ax.axis('off') # Hide the new axes - + 
legend_ax = fig.add_axes( + [0.15, 0.05, 0.7, 0.1] + ) # Position the legend axes (left, bottom, width, height) + legend_ax.axis("off") # Hide the new axes + # Create a shared legend in the bottom-most subplot - handles, labels = axes[2, 1].get_legend_handles_labels() # Get handles and labels from the bottom-left plot - fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), - ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') + handles, labels = axes[ + 2, 1 + ].get_legend_handles_labels() # Get handles and labels from the bottom-left plot + fig.legend( + handles, + labels, + loc="upper center", + bbox_to_anchor=(0.5, 0.2), + ncol=num_strata // 2 + 1, + fontsize="small", + title="INPFC stratum", + ) # plt.show() return fig diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0e5f6a97..5a4765bb 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,13 +1,19 @@ -from sqlalchemy import create_engine, text, Engine, inspect -import sqlalchemy as sqla -import pandas as pd -from typing import Optional, Literal, Union, List -import numpy as np -from pathlib import Path import re +from pathlib import Path +from typing import List, Optional, Union -def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, - primary_keys: Optional[list] = None): +import numpy as np +import pandas as pd +import sqlalchemy as sqla +from sqlalchemy import create_engine, inspect, text + + +def sql_create( + connection: sqla.Connection, + dataframe: pd.DataFrame, + table_name: str, + primary_keys: Optional[list] = None, +): """ Generate a SQL command to create a table with dynamic columns, primary keys, and indices. @@ -20,16 +26,15 @@ def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str: The SQL command to create the table. """ # Generate column definitions - column_definitions = ( - ",\n".join(f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" - for col in dataframe.columns) - ) - + column_definitions = ",\n".join( + f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" for col in dataframe.columns + ) + # Generate primary key definition primary_key_definition = "" if primary_keys: primary_key_definition = f",\nPRIMARY KEY ({', '.join(primary_keys)})" - + # Combine all parts into the final SQL command create_table_command = f""" CREATE TABLE IF NOT EXISTS {table_name} ( @@ -37,13 +42,13 @@ def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: {primary_key_definition} ); """ - + # Execute connection.execute(text(create_table_command.strip())) + def sql_map_tables(connection: sqla.Connection): - """ - """ + """ """ inspector = inspect(connection) table_names = inspector.get_table_names() # result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) @@ -52,7 +57,8 @@ def sql_map_tables(connection: sqla.Connection): # table_names = [name[0] for name in table_names] return table_names -def sql_validate(connection: sqla.Connection, table_name: str): + +def sql_validate(connection: sqla.Connection, table_name: str): """ Check if a table exists in the database. @@ -62,10 +68,11 @@ def sql_validate(connection: sqla.Connection, table_name: str): Returns: bool: True if the table exists, False otherwise. 
- """ + """ inspector = inspect(connection) return table_name in inspector.get_table_names() + def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] = None): """ Get a list of all tables present @@ -75,17 +82,17 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] Returns: list: True if the table exists, False otherwise. - """ + """ # Inspect the columns from the table if columns is None: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Retrieve column information - column_info = inspector.get_columns(table_name) + column_info = inspector.get_columns(table_name) # ---- Format as a dictionary and return the output - return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} - else: + return {col["name"]: {k: v for k, v in col.items() if k != "name"} for col in column_info} + else: # Inspect unique values in specified columns # ---- Create SQL command sql_command = f"SELECT DISTINCT {', '.join(columns)} FROM {table_name};" @@ -94,17 +101,23 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] # ---- Extract unique values unique_values = table.fetchall() # ---- Format as a dictionary and return the output - return ( - {col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns)} - ) + return { + col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns) + } + def sql_drop(connection: sqla.Connection, table_name: str): - """ - """ + """ """ connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) - -def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, - id_columns: Optional[list] = None): + + +def sql_insert( + connection: sqla.Connection, + table_name: str, + columns: list, + dataframe: pd.DataFrame, + id_columns: Optional[list] = None, +): """ Insert data into a table. @@ -115,11 +128,11 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data data (list of dict): List of dictionaries containing data to insert or update. conflict_columns (list): List of column names to use for conflict resolution. 
""" - + # Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - table_columns = [col['name'] for col in inspector.get_columns(table_name)] + table_columns = [col["name"] for col in inspector.get_columns(table_name)] # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' @@ -139,7 +152,7 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Format `id_columns` if id_columns is not None and not isinstance(id_columns, list): id_columns = [id_columns] - + # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) @@ -147,27 +160,27 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data dataframe = dataframe[columns] # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] - + def format_value(x): if isinstance(x, str): return "'{}'".format(x.replace("'", "''")) elif isinstance(x, pd.Timestamp): return "'{}'".format(x) elif x is None: - return 'NULL' + return "NULL" else: return str(x) - + # ---- Tuple to String # data_str = ", ".join( # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" - # f"({', '.join(map(lambda x: f'\'{x}\'' - # if isinstance(x, str) or isinstance(x, pd.Timestamp) + # f"({', '.join(map(lambda x: f'\'{x}\'' + # if isinstance(x, str) or isinstance(x, pd.Timestamp) # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) - + # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" if id_columns: @@ -175,23 +188,29 @@ def format_value(x): ON CONFLICT ({', '.join(id_columns)}) DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} """ - + # Construct the SQL query sql_command = f""" INSERT INTO {table_name} ({column_names}) VALUES {data_str} {on_conflict_clause} - """ - + """ + # Execute connection.execute(text(sql_command.strip())) - + # Commit connection.commit() -def sql_update(connection: sqla.Connection, table_name: str, columns: list, - dataframe: Optional[pd.DataFrame] = None, operation: Optional[str] = None, - condition: Optional[str] = None): + +def sql_update( + connection: sqla.Connection, + table_name: str, + columns: list, + dataframe: Optional[pd.DataFrame] = None, + operation: Optional[str] = None, + condition: Optional[str] = None, +): """ Insert data into a table. @@ -202,14 +221,14 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, data (list of dict): List of dictionaries containing data to insert or update. conflict_columns (list): List of column names to use for conflict resolution. 
""" - + # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' if "*" in columns: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - columns = [col['name'] for col in inspector.get_columns(table_name)] + columns = [col["name"] for col in inspector.get_columns(table_name)] # ---- If not a List elif not isinstance(columns, list): columns = [columns] @@ -220,15 +239,16 @@ def format_value(x): elif isinstance(x, pd.Timestamp): return "'{}'".format(x) elif x is None: - return 'NULL' + return "NULL" else: return str(x) # Format the SET command # ---- Update column by applying arithmetic between table and dataframe if operation is not None and dataframe is not None: - set_list = [f"{column} = {column} {operation} {dataframe[column].values[0]}" - for column in columns] + set_list = [ + f"{column} = {column} {operation} {dataframe[column].values[0]}" for column in columns + ] # ---- Update column by applying arithmetic within table if dataframe is None and operation is not None: # ---- Make sure `operation` is a list @@ -240,7 +260,7 @@ def format_value(x): if dataframe is not None and operation is None: set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list - set_clause = ', '.join(set_list) + set_clause = ", ".join(set_list) # Add the WHERE clause if a parsed condition is provided if condition is not None: @@ -253,14 +273,18 @@ def format_value(x): # Execute connection.execute(text(sql_command.strip())) - + # Commit connection.commit() -def sql_select(connection: sqla.Connection, table_name: str, - columns: Optional[Union[list, str]] = None, - condition: Optional[str] = None, - output_type: type = pd.DataFrame): + +def sql_select( + connection: sqla.Connection, + table_name: str, + columns: Optional[Union[list, str]] = None, + condition: Optional[str] = None, + output_type: type = pd.DataFrame, +): # Columns if columns is None: @@ -286,16 +310,16 @@ def sql_select(connection: sqla.Connection, table_name: str, parsed_condition = parse_condition(condition) sql_command += " WHERE " + parsed_condition - # Execute the command + # Execute the command table = connection.execute(text(sql_command)) # Fetch the data from the table data = table.fetchall() - + # Inspect the table to construct a dictionary of expected datatypes for each column table_info = sql_inspect(connection, table_name=table_name) # ---- Whittle down the information dictionary to isolate just the column datatypes - table_dtypes = {col: info['type'] for col, info in table_info.items()} + table_dtypes = {col: info["type"] for col, info in table_info.items()} # Raise error if `output_type` is invalid if output_type not in [pd.DataFrame, np.ndarray, str, tuple]: @@ -304,23 +328,28 @@ def sql_select(connection: sqla.Connection, table_name: str, f"`pandas.DataFrame`, or `numpy.ndarray`." 
) - # Format the output + # Format the output # ---- DataFrame if output_type is pd.DataFrame: # ---- Create DataFrame output_df = pd.DataFrame(data, columns=table.keys()) # ---- Format the expected datatypes - df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] - for col, dtype in table_dtypes.items() if col in columns } + df_dtypes = { + col: SQL_DTYPES[type(dtype).__name__] + for col, dtype in table_dtypes.items() + if col in columns + } # ---- Apply the dtypes return output_df.astype(df_dtypes) else: # ---- Get the datatypes that will correspond to each value of the tuples tuple_dtypes = [SQL_DTYPES[type(dtype).__name__] for _, dtype in table_dtypes.items()] - # ---- Convert the `Row` objects to tuples + # ---- Convert the `Row` objects to tuples converted_data = [ - tuple(dtype(value) if value is not None else None - for value, dtype in zip(row, tuple_dtypes)) + tuple( + dtype(value) if value is not None else None + for value, dtype in zip(row, tuple_dtypes) + ) for row in data ] # ---- String @@ -333,25 +362,25 @@ def sql_select(connection: sqla.Connection, table_name: str, else: return converted_data -def validate_tables(db_file: str, table_name: Union[str, List[str]], - reference_dataframe: pd.DataFrame): + +def validate_tables( + db_file: str, table_name: Union[str, List[str]], reference_dataframe: pd.DataFrame +): # Helper function def _validate_table(table): # ---- Check table existence if not SQL(db_file, "validate", table_name=table): - raise KeyError( - f"SQL database table `{table}` in `{db_file}` failed to initialize!" - ) + raise KeyError(f"SQL database table `{table}` in `{db_file}` failed to initialize!") # ---- Get DataFrame dtypes (avoid 'object' and similar ambiguous typing) - expected_dtypes = ( - {col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns} - ) - # ---- Inspect the table + expected_dtypes = { + col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns + } + # ---- Inspect the table inspected_table = SQL(db_file, "inspect", table_name=table) # ---- Get the column dtypes (with back-formatting via configuration mapping) table_dtypes = { - col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ + col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ for col in inspected_table.keys() } # ---- Compare keys @@ -363,10 +392,11 @@ def _validate_table(table): f"{', '.join(key_difference)}." ) # ---- Compare dtypes - dtypes_comparison = ( - {key: table_dtypes[key] for key in table_dtypes - if table_dtypes[key] != expected_dtypes.get(key)} - ) + dtypes_comparison = { + key: table_dtypes[key] + for key in table_dtypes + if table_dtypes[key] != expected_dtypes.get(key) + } # ---- Get key names dtypes_different_names = list(set(dtypes_comparison)) # ---- Raise error, if needed @@ -375,16 +405,17 @@ def _validate_table(table): f"The following columns from table `{table}` in `{db_file}` had unexpected " f"datatypes: {', '.join(dtypes_different_names)}." 
) - + # Iterate through tables to validate if isinstance(table_name, list): _ = [_validate_table(table) for table in table_name] else: _validate_table(table_name) + def initialize_database(root_directory: Path, file_settings: dict): - # Get the database name + # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database @@ -401,22 +432,27 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create two tables for 'files read' and 'files processed' # ---- Read files - SQL(db_file, "create", table_name="files_read", dataframe=template_df, - primary_keys=["filepath"]) + SQL( + db_file, "create", table_name="files_read", dataframe=template_df, primary_keys=["filepath"] + ) # ---- Processed files - SQL(db_file, "create", table_name="files_processed", dataframe=template_df, - primary_keys=["filepath"]) - + SQL( + db_file, + "create", + table_name="files_processed", + dataframe=template_df, + primary_keys=["filepath"], + ) + # Query the database ensure it exists # ---- File existence if not Path(db_file).exists(): - raise FileExistsError( - f"SQL database file `{db_file}` failed to initialize!" - ) - + raise FileExistsError(f"SQL database file `{db_file}` failed to initialize!") + # Validate the created tables validate_tables(db_file, ["files_read", "files_processed"], template_df) + SQL_COMMANDS = { "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), @@ -424,37 +460,41 @@ def initialize_database(root_directory: Path, file_settings: dict): "inspect": dict(function=sql_inspect, args=["table_name", "columns"]), "map": dict(function=sql_map_tables, args=[]), "select": dict(function=sql_select, args=["table_name", "columns", "output_type", "condition"]), - "update": dict(function=sql_update, args=["table_name", "columns", "condition", "operation", - "dataframe"]), + "update": dict( + function=sql_update, args=["table_name", "columns", "condition", "operation", "dataframe"] + ), "validate": dict(function=sql_validate, args=["table_name"]), } - + SQL_DTYPES = { - 'int32': 'INTEGER', - 'int64': 'INTEGER', - 'float64': 'FLOAT', + "int32": "INTEGER", + "int64": "INTEGER", + "float64": "FLOAT", "float": "FLOAT", "int": "INTEGER", - 'bool': 'BOOLEAN', + "bool": "BOOLEAN", "Interval": "TEXT", "Timestamp": "DATETIME", - 'object': 'TEXT', + "object": "TEXT", "str": "TEXT", "FLOAT": float, "INTEGER": int, "DATETIME": str, "TEXT": str, "BIGINT": int, -} - -def sql_group_update(db_file: str, - dataframe: pd.DataFrame, - table_name: str, - columns: List[str], - unique_columns: List[str], - operation: Optional[str] = None, - id_columns: Optional[List[str]] = None): - +} + + +def sql_group_update( + db_file: str, + dataframe: pd.DataFrame, + table_name: str, + columns: List[str], + unique_columns: List[str], + operation: Optional[str] = None, + id_columns: Optional[List[str]] = None, +): + # Check for unique values contained within the table unique_values = SQL(db_file, "inspect", table_name=table_name, columns=unique_columns) @@ -462,8 +502,9 @@ def sql_group_update(db_file: str, table_values = {col: dataframe[col].unique().tolist() for col in unique_columns} # Find mismatched indices - new_indices = {col: list(set(table_values[col]) - set(unique_values[col])) - for col in unique_columns} + new_indices = { + col: list(set(table_values[col]) - set(unique_values[col])) for col in unique_columns + } # Filter the DataFrame to include only rows with these missing values # 
---- Create DataFrame copy @@ -477,18 +518,20 @@ def sql_group_update(db_file: str, filtered_df = pd.DataFrame(columns=filtered_df.columns) # Insert into the table if not otherwise present - if not filtered_df.empty: + if not filtered_df.empty: SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) - + case_statements = [] for col in columns: case_stmt = "CASE" for _, row in dataframe.iterrows(): # Construct the filter condition based on unique_columns - filter_conditions = ' AND '.join([ - f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" - for col in unique_columns - ]) + filter_conditions = " AND ".join( + [ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ] + ) # Add the WHEN condition to the CASE statement case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" case_stmt += f" ELSE {col} END" @@ -497,8 +540,7 @@ def sql_group_update(db_file: str, case_statements.append(f"{col} = {col} {operation} {case_stmt}") else: case_statements.append(f"{col} = {case_stmt}") - - + # Update the table # ---- Format the conditional string # case_statements = [] @@ -519,7 +561,7 @@ def sql_group_update(db_file: str, update_clause = ", ".join(case_statements) # Format the SQL COMMAND string - # sql_command = f""" + # sql_command = f""" # UPDATE {table_name} # SET {update_clause} # WHERE ({' OR '.join([ @@ -530,7 +572,7 @@ def sql_group_update(db_file: str, # for _, row in dataframe.iterrows() # ])}); # """ - sql_command = f""" + sql_command = f""" UPDATE {table_name} SET {update_clause}; """ @@ -554,6 +596,7 @@ def sql_group_update(db_file: str, # Dispose engine engine.dispose() + def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: # Get the data input column names @@ -567,17 +610,15 @@ def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List table_columns = data_dict[table_name].columns # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - "latitude", "stratum"]) - ) + key_columns = set(table_columns).intersection( + ["trawl_partition", "sex", "haul_num", "species_id", "longitude", "latitude", "stratum"] + ) # Return a list of the output return list(key_columns) -def get_unique_identifiers(data_dict: dict, - unique_columns: List[str]) -> pd.DataFrame: + +def get_unique_identifiers(data_dict: dict, unique_columns: List[str]) -> pd.DataFrame: # Gather all dataframes from a dictionary into a list if isinstance(data_dict, dict): @@ -585,12 +626,16 @@ def get_unique_identifiers(data_dict: dict, else: df_list = [data_dict] - # Get unique values of each contrast column across the biological datasets + # Get unique values of each contrast column across the biological datasets combined_df = pd.concat( - [df[unique_columns] for df in df_list if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns)], - ignore_index=True + [ + df[unique_columns] + for df in df_list + if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns) + ], + ignore_index=True, ).drop_duplicates() - + # Reduce into a single DataFrame return combined_df # if len(unique_columns) > 1: @@ -601,60 +646,80 @@ def get_unique_identifiers(data_dict: dict, def parse_condition(condition: str): # Replace logical operators with SQL equivalents - condition = condition.replace('&', ' AND ').replace('|', ' OR ') 
- + condition = condition.replace("&", " AND ").replace("|", " OR ") + # Handle "IN" lists and replace square brackets with parentheses - condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) - + condition = re.sub( + r"(\w+)\s*IN\s*\[(.*?)\]", + lambda m: f"{m.group(1)} IN ({m.group(2)})", + condition, + flags=re.IGNORECASE, + ) + # Handle range conditions for BETWEEN, including floats - condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', - lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) - + condition = re.sub( + r"(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)", + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", + condition, + ) + # Handle individual comparisons - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub( + r"(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)", + lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", + condition, + ) + condition = re.sub( + r"(\w+)\s*([<>!=]+)\s*(\'[^\']*\')", + lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", + condition, + ) # Return the parsed condition return condition + def format_sql_select(table_name, column_names, condition_string): # Base SQL command to select columns from the table sql_command = f"SELECT {column_names} FROM {table_name}" - + # Parse the condition string parsed_condition = parse_condition(condition_string) - + # Add the WHERE clause if a parsed condition is provided if parsed_condition: sql_command += " WHERE " + parsed_condition - + # Add a semicolon at the end of the SQL command sql_command += ";" - + return sql_command + def format_sql_columns(kwargs: dict): # Columns if "columns" in kwargs and "condition" not in kwargs: if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) - elif "columns" not in kwargs: + elif "columns" not in kwargs: kwargs["columns"] = "*" # ID/Conflict columns if "id_columns" in kwargs: if isinstance(kwargs["id_columns"], list) or isinstance(kwargs["id_columns"], pd.Index): - kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) + kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) # Return the updated `kwargs` dictionary return kwargs + # TODO: Documentation -def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path], - processed=False) -> dict: +def query_processed_files( + root_directory: Path, file_settings: dict, files: List[Path], processed=False +) -> dict: - # Get the database name + # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database @@ -670,22 +735,33 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List files_str = [str(file) for file in files] # ---- Create DataFrame current_files = pd.DataFrame(files_str, columns=["filepath"]) - + # Check against `files_processed` previous_files = SQL(db_file, "select", table_name="files_processed", output_type=str) # Insert the files into the `files_read` table - if processed: - SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, - id_columns=["filepath"]) + if processed: + SQL( + db_file, + "insert", + table_name="files_processed", + dataframe=current_files, + id_columns=["filepath"], + ) elif not 
current_files.empty: - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, - id_columns=["filepath"]) + SQL( + db_file, + "insert", + table_name="files_read", + dataframe=current_files, + id_columns=["filepath"], + ) # ---- Apply filter by comparing sets and return the output return list(set(files_str) - set(previous_files)), db_file else: return None, db_file + # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): @@ -700,18 +776,21 @@ def sql_data_exchange(database_file: Path, **kwargs): if not table_exists: # ---- Create table SQL(database_file, "create", **kwargs) - # ---- Insert into table + # ---- Insert into table SQL(database_file, "insert", **kwargs) # ---- Return the initial dataframe return kwargs.get("dataframe") - + # Insert into the table SQL(database_file, "insert", **kwargs) - + # Select existing data frame the database and return the output return SQL(database_file, "select", **kwargs) -def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + +def reset_db_files( + file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None +): # Get all database files database_files = file_configuration["database"] @@ -727,22 +806,23 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str if None not in table_exception: table_names = list(set(table_names) - set(table_exception)) # ---- Iterate through - for table_name in table_names: + for table_name in table_names: SQL(db_file, "drop", table_name=table_name) - # ---- Validate that all tables were removed - remaining_tables = SQL(table_names, "map") + # ---- Validate that all tables were removed + remaining_tables = SQL(table_names, "map") if set(table_names).intersection(set(remaining_tables)): - raise ValueError( - f"Attempted reset of [{str(db_file)}] failed." 
- ) + raise ValueError(f"Attempted reset of [{str(db_file)}] failed.") + + +def query_dataset( + db_file: str, + data_dict: dict, + table_name: str, + data_columns: List[str], + unique_columns: List[str], + constraint: Optional[str] = None, +): -def query_dataset(db_file: str, - data_dict: dict, - table_name: str, - data_columns: List[str], - unique_columns: List[str], - constraint: Optional[str] = None): - # Validate that the desired table exists if SQL(db_file, "validate", table_name=table_name): # ---- Inspect the SQL table @@ -753,51 +833,58 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Conditional string formatting helper function def format_value(x): if isinstance(x, str): return "'{}'".format(x.replace("'", "''")) return str(x) - # ---- Create conditional string + + # ---- Create conditional string conditional_str = " | ".join( - [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) - for _, row in unique_keys_df.iterrows()] + [ + " & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows() + ] ) # conditional_str = " | ".join( - # [" & ".join([f"{col} = {val}" for col, val in row.items()]) + # [" & ".join([f"{col} = {val}" for col, val in row.items()]) # for _, row in unique_keys_df.iterrows()] - # ) + # ) # conditional_str = ( - # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - # for col in unique_keys_df.columns]) + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) # ) # ---- Append the additional constraint statement if present if constraint is not None: conditional_str = f"({conditional_str})" + f" & {constraint}" # ---- SELECT the dataset using the conidtional statement - data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, - condition=conditional_str).filter(data_columns) + data_sql = SQL( + db_file, "select", table_name=table_name, columns=valid_keys, condition=conditional_str + ).filter(data_columns) else: data_sql = None # Return the table DataFrame return data_sql -def sql_update_strata_summary(source_db: str, - target_db: str, - source_table: str, - target_table: str, - data_columns: List[tuple[str, str]], - strata: list): - + + +def sql_update_strata_summary( + source_db: str, + target_db: str, + source_table: str, + target_table: str, + data_columns: List[tuple[str, str]], + strata: list, +): + # Format strata list as a string - strata_str = ', '.join(map(str, strata)) + strata_str = ", ".join(map(str, strata)) # Function reference map FUNCTION_MAP = { - "sum": {"function": "SUM", - "suffix": "sum"}, - "mean": {"function": "AVG", - "suffix": "mean"} + "sum": {"function": "SUM", "suffix": "sum"}, + "mean": {"function": "AVG", "suffix": "mean"}, } # Prepare the SQL script @@ -830,7 +917,7 @@ def sql_update_strata_summary(source_db: str, ) WHERE stratum IN ({strata_str}); """ - # ----- Append DETACH commands only once at the end + # ----- Append DETACH commands only once at the end sql_script += """ -- Detach the databases DETACH DATABASE source; @@ -840,7 +927,7 @@ def sql_update_strata_summary(source_db: str, # Create the engine engine = create_engine(f"sqlite:///{target_db}") - # Create the SQL database connection and send the script + # Create the SQL database connection and send the script with engine.connect() as connection: 
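        # NOTE: the ATTACH/UPDATE/DETACH script assembled above contains multiple SQL statements,
        # so it is run through the raw DBAPI (sqlite3) connection's executescript() below; the
        # sqlite3 driver accepts only one statement per execute() call, so a single SQLAlchemy
        # text() execute would not run the full script.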
dbapi_conn = connection.connection _ = dbapi_conn.executescript(sql_script) @@ -851,11 +938,11 @@ def SQL(db_file: str, command: str, **kwargs): # Create engine from `db_file` string engine = create_engine(f"sqlite:///{db_file}") - + # Format the data columns, if necessary, to fit within the SQL commands if command not in ["inspect", "update"]: kwargs = format_sql_columns(kwargs) - + # Run the command try: with engine.connect() as connection: @@ -867,6 +954,6 @@ def SQL(db_file: str, command: str, **kwargs): kwargs = {key: value for key, value in kwargs.items() if key in command_args} # ---- Return output return command_function(connection, **kwargs) - finally: + finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 7752fe63..257829db 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -1,2267 +1,2307 @@ -import numpy as np -import pandas as pd -from sqlalchemy import create_engine, text -from pathlib import Path -import os - -SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) - -# Coordinates -x = np.array([1, 2, 3, 4, 5]) -y = np.array([1, 2, 3, 4, 5]) - -# Create the grid points -grid_points = [(i, j, 0) for i in x for j in y] - -def initialize_grid(): - - -data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") -db_directory = data_root_dir / "database" -# ---- Create the directory if it does not already exist -db_directory.mkdir(parents=True, exist_ok=True) -# ---- Complete path to `biology.db` -db_file = db_directory / "grid.db" - -from sqlalchemy import create_engine, MetaData, Table, select, inspect, update, text, case - -# Initialize the database and create the table -engine = create_engine(f"sqlite:///{db_file}") - -# Define metadata and the table to drop -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Drop the table -with engine.connect() as connection: - grid_table.drop(connection) - print("Table 'grid' has been dropped.") - -# Inspect the database -inspector = inspect(engine) -tables = inspector.get_table_names() -print(tables) - -def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): - """ - Generate a SQL command to create a table with dynamic columns, primary keys, and indices. - - Args: - table_name (str): The name of the table. - columns (dict): A dictionary where keys are column names and values are data types. - primary_keys (list, optional): List of column names to be used as primary keys. - index_columns (list, optional): List of column names to be indexed. - - Returns: - str: The SQL command to create the table. 
- """ - # Generate column definitions - column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) - - # Generate primary key definition - primary_key_definition = "" - if primary_keys: - primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" - - # Generate index definitions - index_definitions = "" - if index_columns: - index_definitions = "\n".join( - f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" - for col in index_columns - ) - - # Combine all parts into the final SQL command - create_table_command = f""" - CREATE TABLE IF NOT EXISTS {table_name} ( - {column_definitions} - {primary_key_definition} - ); - """ - # Return the command and any index definitions - return create_table_command.strip() + "\n" + index_definitions - -# Define metadata and the table to drop -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Drop the table -with engine.connect() as connection: - grid_table.drop(connection) - print("Table 'grid' has been dropped.") - -check_table_exists(engine, "grid") - -with engine.connect() as connection: - sql_create(connection, df, table_name, primary_keys) - -# Create the table -table_name = "grid" -columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} -primary_keys = ["x", "y"] -index_columns = ["x", "y"] - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -with engine.connect() as connection: - connection.execute(text(create_sql)) - -inspector = inspect(engine) -tables = inspector.get_table_names() -print(tables) - -check_table_exists(engine, "grid") - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -converted_data[0] -check_table_exists(engine, "files_read") - -zarr_files_str = ["A", "B", "C", "D"] -# ---- Create DataFrame -current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - -with engine.connect() as connection: - sql_create(connection, table_name="files_read", df=current_files) - sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) - -table_name = "files_read" -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - - - -from sqlalchemy.exc import IntegrityError - -def insert_or_update(engine, table_name, columns, data, conflict_columns): - """ - Insert or update data in a table. - - Args: - engine (Engine): The SQLAlchemy engine instance. - table_name (str): The name of the table. - columns (list): List of column names. - data (list of dict): List of dictionaries containing data to insert or update. - conflict_columns (list): List of column names to use for conflict resolution. 
- """ - - # Prepare the SQL statement for insertion - column_names = ", ".join(columns) - placeholder = ", ".join(f":{col}" for col in columns) - # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) - values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data - ) - - - # Construct the SQL query - sql = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_str} - ON CONFLICT ({', '.join(conflict_columns)}) - DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} - """ - - # Flatten the list of data for execution - # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item in data] for item in sublist] - - # Execute the SQL command - with engine.connect() as connection: - try: - connection.execute(text(sql)) - # connection.commit() - print(f"Data inserted or updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Prepare data for insertion or update -# data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] -data = grid_points - -# Insert or update data -insert_or_update(engine, table_name, columns.keys(), data, primary_keys) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -def update_specific_rows(engine, table_name, updates, conditions): - """ - Update specific rows in a table based on conditions. - - Args: - engine (Engine): The SQLAlchemy engine instance. - table_name (str): The name of the table. - updates (dict): Dictionary of columns and their new values to be updated. - conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
- """ - - # Construct the SET clause for the update - set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) - - # Construct the WHERE clause for the update - where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) - - # Construct the SQL query - sql = f""" - UPDATE {table_name} - SET {set_clause} - WHERE {where_clause} - """ - - # Prepare parameters for the query - parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} - - # Execute the SQL command - with engine.connect() as connection: - try: - connection.execute(text(sql), parameters) - print(f"Rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Define table name -table_name = "grid" -# Define the table and columns -table_name = 'grid' -condition_columns = ['x', 'y'] - -# Define the updates and conditions -dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} -new_data = pd.DataFrame(dd) -new_data -df = new_data - -kwargs = {"table_name": "grid", "columns": df.columns, "df": df} - -with engine.connect() as connection: - # sql_create(connection, table_name = "grid", df = df) - # sql_validate(connection, "grid") - # sql_drop(connection, "grid") - sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, id_columns=["x", "y"]) - - -data_tuples = [tuple(row) for row in df.itertuples(index=False)] - -all_columns = df.columns.tolist() -if len(condition_columns) >= len(all_columns): - raise ValueError("The number of condition columns must be less than the number of columns in data.") - -# Prepare column names and conditions -update_columns = [col for col in all_columns if col not in condition_columns] -condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) -update_str = ", ".join(f"{col} = ?" 
for col in update_columns) -data_tuples = [tuple(row) for row in df.itertuples(index=False)] -# Generate values string for SQL command -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuples -) - -# Construct the SQL query -sql = f""" -INSERT INTO {table_name} ({', '.join(all_columns)}) -VALUES {values_str} -ON CONFLICT ({', '.join(condition_columns)}) -DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col in update_columns)} -""" - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql)) - connection.commit() - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - - -# Insert or update data -insert_or_update(engine, table_name, columns.keys(), data, primary_keys) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -# Ensure that condition_columns match the length of data tuples minus the update column -if len(condition_columns) != len(df.columns) - 1: - raise ValueError("The number of condition columns must match the number of columns in data minus the update column.") - -# Prepare the SQL statement for update -update_columns = [col for col in df.columns if col not in condition_columns] -condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) -update_str = ", ".join(f"{col} = ?" 
for col in update_columns) -# Convert DataFrame rows to list of tuples -data_tuples = [tuple(row) for row in df.itertuples(index=False)] - -# Generate a values string for the SQL command -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuples -) -# Construct the SQL query -sql = f""" -UPDATE {table_name} -SET {update_str} -WHERE {condition_str} -""" - -# Flatten the list of data for execution -flattened_data = [] -for row in data_tuples: - conditions = row[:len(condition_columns)] - update_values = row[len(condition_columns):] - flattened_data.extend(conditions + update_values) - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql), flattened_data) - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql), flattened_data) - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") -# Update specific rows -update_specific_rows(engine, table_name, updates, conditions) - -# Verify the update -sql_command = f"SELECT * FROM {table_name};" -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - connection.commit() - -check_table_exists(engine, "grid") - -# Define table name, columns, and data -table_name = 'grid' -columns = ['x', 'y', 'value'] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0) -] - -# Prepare the columns part of the SQL statement -columns_str = ", ".join(columns) - -# Prepare the values part of the SQL statement -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data -) - - - - - - -print("Generated SQL Command:") -print(sql_command) - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - -def insert_values_sql(table_name, columns, values, filter_clause=""): - """ - Generate a SQL command to insert values into a table. - - Args: - table_name (str): The name of the table. - columns (list): List of column names to be inserted. - values (list of tuples): List of tuples where each tuple represents a row of values to be inserted. - filter_clause (str, optional): Optional filter clause to specify conditions for insertion. - - Returns: - str: The SQL command to insert values into the table. - """ - # Generate column names - column_names = ", ".join(columns) - - # Generate value placeholders - value_placeholders = ", ".join("?" * len(columns)) - - # Generate values part - values_part = ", ".join(f"({', '.join('?' 
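# A working sketch of the batched UPDATE above: SQLAlchemy's text() construct expects named
# (:param) binds rather than "?" markers, and accepts a list of dicts for executemany.
# Table and column names below are illustrative.
from sqlalchemy import create_engine, text

example_engine = create_engine("sqlite:///:memory:")
with example_engine.begin() as connection:
    connection.execute(text("CREATE TABLE grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y))"))
    connection.execute(text("INSERT INTO grid (x, y, value) VALUES (1, 1, 0.0), (2, 2, 0.0)"))
    update_stmt = text("UPDATE grid SET value = :value WHERE x = :x AND y = :y")
    connection.execute(update_stmt, [{"x": 1, "y": 1, "value": 10.0}, {"x": 2, "y": 2, "value": 20.0}])
    print(connection.execute(text("SELECT * FROM grid ORDER BY x")).fetchall())
    # [(1, 1, 10.0), (2, 2, 20.0)]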
* len(columns))})" for _ in values) - - # Flatten the values list for insertion - flattened_values = [item for sublist in values for item in sublist] - - # Create the SQL command - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - {filter_clause} - """ - return insert_command.strip(), flattened_values - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0)] - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Data:\n", insert_data) - -insrt_stmt = - -with engine.connect() as connection: - connection.execute(text(insert_sql), tuple(insert_data)) - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] - -# Call the function -insert_or_update_table(engine, table_name, columns, data, conflict_columns) - -# Example usage -table_name = "grid" -columns = ["x", "y", "value"] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0), -] - -sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" -test_data = [{'x': 1, 'y': 1, 'value': 1.0}] - -with engine.connect() as connection: - connection.execute(text(sql_command), test_data) - -# Generate the SQL command and data -insert_stmt = insert_into_table(table_name, columns, data) - -# Print the generated SQL command (for validation) -print("Insert SQL Command:") -print(insert_stmt) - -# Print for validation -print("Insert SQL Command:") -print(insert_sql) -print("Data:") -print(insert_data) - -# Example execution with SQLAlchemy -with engine.connect() as connection: - connection.execute(insert_stmt) - -def insert_values_sql(table_name, columns, values): - """ - Generate SQL command for inserting values into a table. - - Args: - table_name (str): The name of the table. - columns (list): List of column names. - values (list of tuples): List of values to insert. - - Returns: - str: The SQL command to insert the values. - list: Flattened list of values for binding to the SQL command. - """ - column_names = ", ".join(columns) - value_placeholders = ", ".join("?" * len(columns)) - values_part = ", ".join(f"({value_placeholders})" for _ in values) - flattened_values = [item for sublist in values for item in sublist] - - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - """ - return insert_command.strip(), flattened_values - -def check_table_exists(engine, table_name): - """ - Check if a table exists in the database. - - Args: - engine: SQLAlchemy engine object. - table_name (str): The name of the table to check. - - Returns: - bool: True if the table exists, False otherwise. - """ - inspector = inspect(engine) - return table_name in inspector.get_table_names() - -with engine.connect() as connection: - # sql_validate(connection, "grid") - sql_inspect(connection) - sql_drop(connection, table_name) - -def select_from_table(engine, table_name, columns='*'): - """ - Select data from a table. - - Args: - engine: SQLAlchemy engine object. - table_name (str): The name of the table to select from. - columns (str or list): Columns to select. '*' selects all columns. - - Returns: - list: List of rows returned by the query. 
- """ - metadata = MetaData(bind=engine) - table = Table(table_name, metadata, autoload_with=engine) - - if columns == '*': - columns = [col.name for col in table.columns] - elif isinstance(columns, str): - columns = [columns] - - stmt = select([table.c[col] for col in columns]) - - with engine.connect() as connection: - result = connection.execute(stmt) - return result.fetchall() - -# Create table -table_name = "grid" -columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} -primary_keys = ["x", "y"] -index_columns = ["value"] - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -with engine.connect() as connection: - connection.execute(create_sql) - -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] - -# Insert data function -def insert_values_sql(table_name, columns, values): - column_names = ", ".join(columns) - value_placeholders = ", ".join("?" * len(columns)) - values_part = ", ".join(f"({value_placeholders})" for _ in values) - - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - """ - # Flatten the list of values into a single list - flattened_values = [value for sublist in values for value in sublist] - - return insert_command.strip(), flattened_values - - -table_name = 'grid' -columns = ['x', 'y', 'value'] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0) -] - -# Prepare the columns part of the SQL statement -columns_str = ", ".join(columns) - -# Prepare the values part of the SQL statement -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data -) - -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -print(f"Data in table {table_name}:") -for row in rows: - print(row) -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Insert Data:\n", insert_data) - -with engine.connect() as connection: - connection.execute(insert_sql, [insert_data]) - -# Check table existence -exists = check_table_exists(engine, table_name) -print(f"Table '{table_name}' exists: {exists}") - -# Select data from table -data = select_from_table(engine, table_name, insert_columns) -print(f"Data from '{table_name}':") -for row in data: - print(row) - - - - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0)] - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Data:\n", insert_data) - -# Example usage -table_name = "grid" -columns = { - "x": "INTEGER", - "y": "INTEGER", - "value": "REAL" -} -primary_keys = ["x", "y"] -index_columns = ["value"] - -sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) -print(sql_command) - -# Create the table -create_table_sql = """ -CREATE TABLE IF NOT EXISTS 
grid ( - x INTEGER, - y INTEGER, - value REAL, - PRIMARY KEY (x, y) -); -""" - -# Insert grid points -insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) -insert_sql = f""" -INSERT INTO grid (x, y, value) VALUES {insert_values}; -""" - -# Connect to the database and execute the commands -with engine.connect() as connection: - try: - # Create table if it does not exist - connection.execute(text(create_table_sql)) - # Insert grid points - connection.execute(text(insert_sql)) - connection.commit() - print("Grid points successfully inserted.") - except Exception as e: - print(f"An error occurred: {e}") - - -engine = create_engine(f"sqlite:///{db_file}") -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Read existing grid values from the database into a DataFrame -with engine.connect() as connection: - select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) - result = connection.execute(select_stmt) - existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) - -# Coordinates to update -update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} - -# Create a dictionary for fast lookup -update_dict = {(i, j): 1.0 for i, j in update_coords} - -# Update the grid_points with new values where applicable -updated_grid_points = [ - (i, j, update_dict.get((i, j), value)) - for i, j, value in grid_points -] - -# Convert the list of tuples to a DataFrame -df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) - -# Print the DataFrame -print(df_updated_grid_points) - -# Merge existing and updated data to find differences -merged_data = pd.merge(existing_data, df_updated_grid_points, on=['x', 'y'], suffixes=('_existing', '_updated')) -differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] - -# Assuming 'differences' is your DataFrame with updated values -# Create a dictionary for batch updating -update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() - -# Generate the SQLAlchemy update statement -update_stmt = update(grid_table).where( - grid_table.c.x.in_(update_dict.keys()) -).values({ - grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) -}) - -# Create the CASE statement -case_stmt = case( - { - (grid_table.c.x == x) & (grid_table.c.y == y): value - for (x, y), value in update_dict.items() - }, - else_=grid_table.c.value -) - -# Convert the DataFrame into a dictionary of case statements -case_stmt = case( - [(grid_table.c.x == x) & (grid_table.c.y == y), value] - for (x, y), value in update_dict.items() -) - -# Create the case statement -case_stmt = case( - { (x, y): value for (x, y), value in update_dict.items() }, - value=grid_table.c.x, # Assuming `x` is the column being compared - else_=grid_table.c.value -) - -case_stmt = case( - { - (x, y): value - for (x, y), value in update_dict.items() - }, - value=grid_table.c.x, - else_=grid_table.c.value -) - -# Create the case statement -# Create a CASE statement using a dictionary -case_stmt = case( - { - (grid_table.c.x == x) & (grid_table.c.y == y): value - for (x, y), value in update_dict.items() - }, - else_=grid_table.c.value -) -case_stmt = case( - {((grid_table.c.x == x) & (grid_table.c.y == y)): value - for (x, y), value in update_dict.items()}, - else_=grid_table.c.value -) -print("Case Statement:", str(case_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - -# Create the update statement -update_stmt = ( - 
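# The case() constructions above mix the legacy list signature with the 1.4/2.0 signature,
# where case() takes positional (condition, value) tuples. A self-contained sketch of the
# single-statement conditional update, with illustrative names:
from sqlalchemy import Column, Float, Integer, MetaData, Table, case, create_engine, insert, select, update

example_metadata = MetaData()
example_grid = Table(
    "grid", example_metadata,
    Column("x", Integer, primary_key=True),
    Column("y", Integer, primary_key=True),
    Column("value", Float),
)
example_engine = create_engine("sqlite:///:memory:")
example_metadata.create_all(example_engine)

update_dict = {(1, 1): 1.0, (2, 2): 1.0}
case_stmt = case(
    *[((example_grid.c.x == x) & (example_grid.c.y == y), value)
      for (x, y), value in update_dict.items()],
    else_=example_grid.c.value,
)

with example_engine.begin() as connection:
    connection.execute(insert(example_grid), [{"x": i, "y": j, "value": 0.0} for i in (1, 2) for j in (1, 2)])
    connection.execute(update(example_grid).values(value=case_stmt))
    print(connection.execute(select(example_grid)).fetchall())
    # [(1, 1, 1.0), (1, 2, 0.0), (2, 1, 0.0), (2, 2, 1.0)]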
update(grid_table). - where(grid_table.c.value != case_stmt). - values(value=case_stmt) -) - -print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - -# Print the SQL for each update -for (x, y), value in update_dict.items(): - update_stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=value) - ) - # Print the SQL statement with literal values for debugging - print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - # Execute the update statement - with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") - -# Execute the update -with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries.") - -engine.dispose() - -engine = create_engine(f"sqlite:///{db_file}") -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Verify the updated rows -select_stmt = select(grid_table) - -with engine.connect() as connection: - result = connection.execute(select_stmt) - rows = result.fetchall() - -for row in rows: - print(row) - -# Define your SQLite engine and metadata -engine = create_engine(F'sqlite:///{db_file}') -metadata = MetaData() - -# Reflect the grid table -grid_table = Table('grid', metadata, autoload_with=engine) - -# Define your update dictionary -update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} - -# Execute updates +# import os +# from pathlib import Path + +# import numpy as np +# import pandas as pd +# from sqlalchemy import create_engine, text + +# SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) + +# # Coordinates +# x = np.array([1, 2, 3, 4, 5]) +# y = np.array([1, 2, 3, 4, 5]) + +# # Create the grid points +# grid_points = [(i, j, 0) for i in x for j in y] + +# def initialize_grid(): + + +# data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") +# db_directory = data_root_dir / "database" +# # ---- Create the directory if it does not already exist +# db_directory.mkdir(parents=True, exist_ok=True) +# # ---- Complete path to `biology.db` +# db_file = db_directory / "grid.db" + +# from sqlalchemy import MetaData, Table, case, create_engine, inspect, select, text, update + +# engine = create_engine(f"sqlite:///{db_file}") + +# # Define metadata and the table to drop +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Drop the table # with engine.connect() as connection: -connection = engine.connect() -# for (x, y), value in update_dict.items(): -(x,y) = (1, 1) -value = update_dict[(1,1)] - -update_stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=value) -) -# Print the SQL statement for debugging -print("Executing Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - -# Execute the update statement -result = connection.execute(update_stmt) -print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") -connection.close() - -select_stmt = select(grid_table.c.x) - -# Execute the SELECT statement -with engine.connect() as connection: - result = connection.execute(select_stmt) - x_values = result.fetchall() - -type(x_values[0]) - -select_stmt = select(grid_table.c.y) - -# Execute the SELECT statement -with engine.connect() as connection: - 
result = connection.execute(select_stmt) - y_values = result.fetchall() - -select_stmt = select(grid_table.c.value) - -# Execute the SELECT statement -with engine.connect() as connection: - result = connection.execute(select_stmt) - values = result.fetchall() - -case_stmt = case( - *[(grid_table.c.x == x) & (grid_table.c.y == y, value) - for (x, y), value in update_dict.items()], - else_=grid_table.c.value -) - -update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} - -with engine.connect() as connection: - # Select all values to check the current state - result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) - current_values = result.fetchall() - print("Current Values:", current_values) - -with engine.connect() as connection: - with connection.begin(): # Begin a transaction - for (x, y), value in update_dict.items(): - stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=grid_table.c.value + value) - ) - connection.execute(stmt) - -with engine.connect() as connection: - # Re-select to check the updated state - result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) - updated_values = result.fetchall() - print("Updated Values:", updated_values) - - -# Confirm the updates -with engine.connect() as connection: - select_stmt = select([grid_table]) - result = connection.execute(select_stmt) - rows = result.fetchall() - -# Print all rows to verify updates -print("Database contents after update:") -for row in rows: - print(row) - - -# Construct the update statement -update_stmt = ( - update(grid_table) - .values(value=case_stmt) - .where(grid_table.c.value != case_stmt) -) - -# Create a SELECT statement to fetch all rows from the grid_table -select_stmt = select(grid_table) - -# Execute the SELECT statement and fetch results -with engine.connect() as connection: - result = connection.execute(select_stmt) - rows = result.fetchall() - -# Print or inspect the fetched rows -for row in rows: - print(row) - -# Create the update statement -update_stmt = ( - update(grid_table) - .where(grid_table.c.value != case_stmt) - .values(value=case_stmt) -) - -# Execute the update -with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries.") - -case( - [ - ((grid_table.c.x == x) & (grid_table.c.y == y), value) - for (x, y), value in update_dict.items() - ], - else_=grid_table.c.value -) - -# Create a case statement for conditional update -case_statements = { - (x, y): case( - [(grid_table.c.x == x) & (grid_table.c.y == y, value)], - else_=grid_table.c.value - ) - for (x, y), value in update_dict.items() -} - - -# Define SQL command to select all data from the grid table -select_sql = "SELECT * FROM grid;" - -# Connect to the database and execute the query -with engine.connect() as connection: - try: - # Execute the select command - result = connection.execute(text(select_sql)) - # Fetch all rows from the result - rows = result.fetchall() - # Print the results - print("Data in grid table:") - for row in rows: - print(row) - except Exception as e: - print("An error occurred: {}".format(e)) - -# Coordinates to update -update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} - -# Create a copy of grid_points and update specific coordinates -updated_grid_points = [ - (i, j, 1.0) if (i, j) in update_coords else (i, j, value) - for i, j, value in grid_points -] - -# Retrieve current data from the database -with 
engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - current_data = result.fetchall() - -# Convert to a dictionary for easy comparison -current_values = {(x, y): value for x, y, value in current_data} - -# Convert updated_grid_points to a dictionary -updated_values = {(i, j): value for i, j, value in updated_grid_points} - -# Find differences -differences = [ - (i, j, value) - for i, j, value in updated_grid_points - if (i, j) in updated_values and (i, j) not in current_values or - (i, j) in current_values and current_values[(i, j)] != value -] - -# Update differing values in the database -with engine.connect() as connection: - for i, j, value in differences: - connection.execute( - text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), - ) - print(f"Updated {len(differences)} entries.") - -# Step 8: Read the table into Python -with engine.connect() as connection: - # Query to select all rows from the table - result = connection.execute(text("SELECT x, y, value FROM grid;")) - df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) - -# Print the DataFrame to validate the changes -print(df) - -# Check current values -with engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} - -print("Current grid points in database:") -for row in current_values.items(): - print(row) - -print("Updated grid points with changes:") -for row in updated_grid_points: - print(row) - -# Determine differences -differences = [ - (i, j, value) - for i, j, value in updated_grid_points - if (i, j) in current_values and current_values[(i, j)] != value -] - -print(f"Differences to update: {differences}") - -# Step 6: Update the database with INSERT OR REPLACE -with engine.connect() as connection: - with connection.begin(): # Ensure transactions are committed - for i, j, value in updated_grid_points: - sql = """ - INSERT OR REPLACE INTO grid (x, y, value) - VALUES (:x, :y, :value) - """ - print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") - connection.execute( - text(sql), - {"x": i, "y": j, "value": value} - ) - print(f"Updated entries with INSERT OR REPLACE.") - -# Step 8: Read the table into Python -with engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - rows = result.fetchall() - df = pd.DataFrame(rows, columns=['x', 'y', 'value']) - -# Print the DataFrame to validate the changes -print("Updated table data:") -print(df) - - -engine.dispose() - -# Check if the file exists and then remove it -if db_file.exists(): - db_file.unlink() - print(f"Deleted the file: {db_file}") -else: - print(f"The file does not exist: {db_file}") - -with engine.connect() as connection: - connection.execute(text(""" - CREATE TABLE IF NOT EXISTS grid ( - x INTEGER, - y INTEGER, - value REAL, - PRIMARY KEY (x, y) - ); - """)) - - connection.execute(text(""" - INSERT OR REPLACE INTO grid (x, y, value) VALUES - (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), - (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), - (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), - (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), - (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); - """)) - - # Insert initial values (0) into the grid table - values = ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) - connection.execute(text("INSERT INTO grid (x, y, value) 
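# A small sketch of the difference check above: compare the values currently stored for each
# (x, y) cell against the updated grid points and keep only cells that actually changed.
# The dictionaries below are stand-ins for the database query results.
current_values = {(1, 1): 0.0, (2, 2): 0.0, (3, 3): 2.0}
updated_grid_points = [(1, 1, 1.0), (2, 2, 0.0), (3, 3, 2.0)]

differences = [
    (i, j, value)
    for i, j, value in updated_grid_points
    if (i, j) not in current_values or current_values[(i, j)] != value
]
print(differences)  # [(1, 1, 1.0)] -- only the changed cell needs an UPDATE / INSERT OR REPLACE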
VALUES {values};".format(values=values))) - - # Commit - connection.commit() - - # Verify data insertion - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - print("Data in grid table:", rows) - - connection.execute(text(""" - INSERT INTO grid (x, y, value) VALUES - """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) - -engine.dispose() - - - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - print("Data in grid table:", rows) - -with engine.connect() as connection: - result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) - print(result.fetchall()) - -with engine.connect() as connection: - # Describe the table schema - result = connection.execute(text("PRAGMA table_info(grid);")) - columns = result.fetchall() - print("Table schema:", columns) - -with engine.connect() as connection: - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - for row in rows: - print(row) - -SQL(db_file, command="select") - - - - - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import geopandas as gpd -from geopy.distance import distance -from shapely.geometry import Polygon, Point, box -import geopandas as gpd -from shapely.ops import unary_union -import pyproj -import geopy -from echopop.spatial.projection import wgs84_to_utm, utm_string_generator -import shapely.geometry -from echopop.survey import Survey -survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml" , - survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml" ) - - -grid_settings = file_configuration["geospatial"]["griddify"] -# lat_min = grid_settings["bounds"]["latitude"][0] -lat_min = 33.75 -# lat_max = grid_settings["bounds"]["latitude"][1] -lat_max = 55.50 -# lon_min = grid_settings["bounds"]["longitude"][0] -lon_min = -134.25 -lon_max = grid_settings["bounds"]["longitude"][1] - -projection = file_configuration["geospatial"]["projection"] - -utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) -utm_num = int(utm_code) -utm_str = f"epsg:{utm_num}" - -biology_data = filtered_biology_output - -from sqlalchemy import create_engine, text, Engine, inspect -root_dir = file_configuration["data_root_dir"] -db_directory = Path(root_dir) / "database" -db_directory.mkdir(parents=True, exist_ok=True) -db_file = db_directory / "biology.db" -# Create the engine with the full path -engine = create_engine(f'sqlite:///{db_file}') - -SQL_COMMANDS = { - "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", - "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", - "drop": "DROP TABLE IF EXISTS {table_name};", - "select": "SELECT {columns} FROM {table_name};", - "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", - # "insert": "INSERT INTO {table_name} ({columns});", - "insert": """ - INSERT INTO {table_name} ({columns}) - SELECT {columns} - FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) - {filter_clause}; - """, - "inspect": None, -} - -SQL_DTYPES = { - 'int32': 'INTEGER', - 'int64': 'INTEGER', - 'float64': 'FLOAT', - 'bool': 'BOOLEAN', - 'datetime64[ns]': 'DATETIME', - 'object': 'TEXT' -} - -def SQL(db_file: str, command: str, **kwargs): - - # Create engine from `db_file` string - engine = 
create_engine(f"sqlite:///{db_file}") - - # Format `columns`, if there are any and more than 1 - if "columns" in kwargs.keys(): - if isinstance(kwargs["columns"], list): - kwargs["columns"] = ", ".join(kwargs["columns"]) - else: - kwargs["columns"] = "*" - - # Format `columns`, if there are any and more than 1 - # if "filter_columns" in kwargs.keys(): - # # ---- Store the value for later - # kwargs["filter_columns_store"] = kwargs["filter_columns"] - # if isinstance(kwargs["filter_columns"], list): - # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) - - # Run the command - try: - with engine.connect() as connection: - # ---- SELECT - if command == "select": - return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # ---- CREATE - elif command == "create": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Check whether the table already exists or not - table_exists = ( - connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() - ) - # ---- If it doesn't, pre-allocate the table - if table_exists is None: - # ---- Get column definitions as a string - column_def_dict = { - col: SQL_DTYPES.get(str(dtype), 'TEXT') - for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) - } - # ---- Convert to a single string - kwargs["column_definitions"] = ( - ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) - ) - # ---- Create table - connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) - # ---- REPLACE - elif command == "replace": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Replace current - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, - if_exists="replace", index=False) - - # ---- INSERT - elif command == "insert": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Check if - # table_exists = ( - # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() - # ) - # tables = SQL(db_file, "inspect") - # ---- If it doesn't, pre-allocate the table - # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, - if_exists="append", index=False) - # else: - # # ---- Format `filter_columns` command if present - # if "filter_columns" in kwargs.keys(): - # # ---- Fetch table - # fetch_table = ( - # connection.execute(text( - # ("SELECT DISTINCT {filter_columns} FROM {table_name}") - # .format(**kwargs)) - # ) - # ) - # # ---- Format the SQL data into a DataFrame - # fetched_df = pd.DataFrame(fetch_table.fetchall(), columns=fetch_table.keys()) - # # ---- Create an index tuples - # index_tuples = ( - # set(fetched_df[kwargs["filter_columns_store"]] - # .itertuples(index=False, name=None)) - # ) - # # ---- Filter the dataframe - # filtered_df = ( - # df_to_add[ - # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) - # .isin(index_tuples) - # ] - # ) - # # ---- Insert the data - # filtered_df.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="append", index=False) - # else: - # df_to_add.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="append", index=False) - # ---- INSPECT - elif command == "inspect": - return inspect(engine).get_table_names() - else: - connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) - finally: - # ---- Dispose of the engine to release any resources being pooled/used - engine.dispose() - -_ = SQL(db_file, "drop", table_name="catch_df") -_ = SQL(db_file, "drop", 
table_name="specimen_df") -_ = SQL(db_file, "drop", table_name="length_df") -_ = SQL(db_file, "drop", table_name="files_read") - -_ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) -current = SQL(db_file, "select", table_name="files_read", columns="filepath") -current - - -# Get acoustic directory and initialization settings -# ---- Files -biology_file_settings = file_configuration["input_directories"]["biological"] -# ---- General settings -biology_analysis_settings = file_configuration["biology"] - -# Get the file-specific settings, datatypes, columns, etc. -# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# ---- Extract the expected file name ID's -biology_file_ids = biology_file_settings["file_name_formats"] -# ---- Extract all of the file ids -biology_config_ids = list(biology_file_ids.keys()) -# ---- Initialize the dictionary that will define this key in the `input` attribute -biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} -# ---- Initialize the SQL dictionary -sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - -# Create full filepath -biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] -) -# ---- Directory check -directory_existence = biology_directory_path.exists() -# ---- Error evaluation (if applicable) -if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." - ) -# ---- Get the defined file extension -file_extension = biology_file_settings["extension"] -# ---- Create Path.glob generator object -file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") -#---- Create list of `*.csv`` files -csv_files = list(file_path_obj) -# ---- Ensure files exist or raise error otherwise -if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
- ) -else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - -# Iterate through each of the file ids and read in the data -for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - -# Get contrasts used for filtering the dataset -# ---- Species -species_filter = file_configuration["species"]["number_code"] -# ---- Trawl partition information -trawl_filter = biology_analysis_settings["catch"]["partition"] -# ---- Apply the filter -filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == 
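# A sketch of the file-tracking pattern above: compare the files currently on disk against the
# "files_read" table and only ingest the difference. The paths and the in-memory database are
# placeholders for the real biology directory and biology.db.
import pandas as pd
from sqlalchemy import create_engine

example_engine = create_engine("sqlite:///:memory:")

# Pretend two files were already processed on an earlier pass
pd.DataFrame({"filepath": ["biology/haul_001.csv", "biology/haul_002.csv"]}).to_sql(
    "files_read", con=example_engine, index=False
)

# Files currently present (normally produced by Path(...).glob(f"*.{file_extension}"))
csv_files_str = ["biology/haul_001.csv", "biology/haul_002.csv", "biology/haul_003.csv"]

previous_files = set(pd.read_sql("SELECT filepath FROM files_read", con=example_engine)["filepath"])
new_files = [file for file in csv_files_str if file not in previous_files]
print(new_files)  # ['biology/haul_003.csv']

# Record the newly processed files so the next pass skips them
pd.DataFrame({"filepath": new_files}).to_sql(
    "files_read", con=example_engine, if_exists="append", index=False
)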
trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty -} - -# Update the SQL database -for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - -# Combine the two datasets -merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) -} -# ---- Return output -merged_output - -coordinate_metadata.attrs[] - -SQL(biology_db, command="drop", table_name="catch_df") -SQL(biology_db, command="drop", table_name="specimen_df") -SQL(biology_db, command="drop", table_name="length_df") -SQL(biology_db, command="drop", table_name="files_read") -_ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") -tables = SQL(db_file, "inspect") -tables -current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) -current - -SQL(db_file, "select", table_name="catch_df", columns="*") -new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) -_ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) -current = SQL("select", engine, table_name="csv_files_read", columns="file_path") -current -for table_name, df in biology_data.items(): - df.to_sql(table_name, con=engine, if_exists='append', index=False) -command = "read" -engine = create_engine(f'sqlite:///{db_file}') -table_name = "files_read" -columns = "file_path" - -kwargs = { - "table_name": table_name, - "columns": columns, -} - -zarr_data_ds["depth"].diff(dim="depth") - -prc_nasc_df.groupby(["longitude", "latitude"]) - -from pandas.core.groupby import DataFrameGroupBy - -def estimate_echometrics(acoustic_data_df: pd.DataFrame): - - # Create copy - acoustic_df = acoustic_data_df.copy().reset_index(drop=True) - - # Pre-compute the change in depth - acoustic_df["dz"] = acoustic_df["depth"].diff() - - # Initialize echometrics dictionary - echometrics = {} - - # Compute the metrics center-of-mass - if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation": np.nan, - "occupied_area": 0.0, - }) - else: - - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. 
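# A minimal sketch of the merge step above: combine rows already stored in SQL with the newly
# filtered rows and drop exact duplicates. The two frames are stand-ins for entries of
# sql_biology_output and filtered_biology_output.
import pandas as pd

sql_rows = pd.DataFrame({"haul_num": [1, 2], "length": [23.5, 31.0]})
new_rows = pd.DataFrame({"haul_num": [2, 3], "length": [31.0, 27.0]})

merged = pd.concat([sql_rows, new_rows]).drop_duplicates().reset_index(drop=True)
print(merged)  # hauls 1, 2, and 3 -- the duplicated haul 2 row appears only once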
) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass - echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the dispersion - echometrics.update({ - "dispersion": ( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) - - # Return the dictionary - return echometrics - -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): - - # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} - - # Horizontally concatenate `echometrics`, if `True` - if echometrics: - # ---- Compute values - # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) - # ---- Merge - nasc_dict.update(echometrics_dict) - - # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) - -def process_group(group): - result = integrate_nasc(group, echometrics=True) - result = result.reset_index(drop=True) - # Concatenate the result back to the original group for alignment - group = group.reset_index(drop=True) - combined = pd.concat([group, result], axis=1) - return combined - -acoustic_data_df = acoustic_data["prc_nasc_df"] - - -rc_nasc_df[prc_nasc_df["distance"] == 0.0] -acoustic_data_df = mek[mek["distance"] == 0.0] -pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() -nasc_data_df = ( - prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) - .reset_index() -) - - - - -kwargs = { - "table_name": "csv_files_read", - "columns": "file_path", - "dataframe": new_files_df -} - -current_process = psutil.Process() -import logging - -# Create a session -Session = sessionmaker(bind=engine) -session = Session() - -# Perform database operations -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) -logger.info("Performing database operations") - -# Create a session -Session = sessionmaker(bind=engine) -session = Session() - -# Perform database operations -logger.info("Performing database operations") - -# Close the session -session.close() -logger.info("Session closed") - -# Dispose the engine -engine.dispose() -logger.info("Engine disposed") - -# Force garbage collection -import gc -gc.collect() -logger.info("Garbage collection performed") - -import psutil - -pid = psutil.Process().pid -process = psutil.Process(pid) -open_files = process.open_files() -db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' - -# Check if the file is still in use -for file in open_files: - if db_path in file.path: - logger.info(f"File {db_path} is still in use.") - else: - logger.info(f"File {db_path} is not in use.") - -# Define the SQL to drop the 
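# A toy check of the echometric formulas above (center of mass, dispersion, evenness, and the
# index of aggregation) on a made-up NASC-by-depth profile; the values are illustrative only.
import pandas as pd

profile = pd.DataFrame({"depth": [10.0, 20.0, 30.0, 40.0], "NASC": [0.0, 50.0, 150.0, 0.0]})

nasc_total = profile["NASC"].sum()
center_of_mass = (profile["depth"] * profile["NASC"]).sum() / nasc_total
dispersion = (((profile["depth"] - center_of_mass) ** 2) * profile["NASC"]).sum() / nasc_total
evenness = (profile["NASC"] ** 2).sum() / nasc_total ** 2
aggregation = 1.0 / evenness

print(center_of_mass, dispersion, evenness, aggregation)  # 27.5 18.75 0.625 1.6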
table -drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" -# Execute the drop table SQL -with engine.connect() as connection: - _ = connection.execute(text(drop_table_sql)) - -import sqlite3 -if os.path.exists(db_path): - conn = sqlite3.connect(db_path) - conn.close() - # Force the file to be removed - try: - os.remove(db_path) - print(f"Database file {db_path} has been deleted.") - except PermissionError: - print(f"Failed to delete {db_path}. The file is still in use.") - -create_table_sql = """ -CREATE TABLE IF NOT EXISTS csv_files_read ( - file_path TEXT UNIQUE -); -""" -# Execute the create table SQL -with engine.connect() as connection: - _ = connection.execute(text(create_table_sql)) - -root_directory = Path(root_dir) -dataset = "biology" - -# Convert to strings -csv_files_str = [str(file) for file in csv_files] - -existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) -existing_files_set = set(existing_files_df['file_path']) -# Filter out duplicates from the csv_files list -new_files = [file for file in csv_files_str if file not in existing_files_set] -# Insert only new file paths into the SQL table -if new_files: - new_files_df = pd.DataFrame(new_files, columns=['file_path']) - _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) - - -with engine.connect() as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS csv_files_read ( - file_path TEXT UNIQUE - ) - """) - -csv_files -files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) -file_name_format = biology_file_ids[id] -def compile_filename_format(file_name_format: str): - - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - - # Compile the regex pattern and return the output - return re.compile(regex_pattern) - -from sqlalchemy.orm import sessionmaker -Session = sessionmaker(bind=engine) -session = Session() -session.close() -engine.pool.status() -# Dispose the engine to close all connections -engine.dispose() -import gc -gc.collect() -import psutil -dbapi_conn = engine.raw_connection() -dbapi_conn.close() -# Get the process ID of the current process -pid = psutil.Process().pid - -# List all open files for the current process -process = psutil.Process(pid) -open_files = process.open_files() - -for file in open_files: - print(file.path) - - -pattern = filename_format -config_settings = sub_config_map -regex_pattern = pattern - -# Replace patterns based on LIVE_FILE_FORMAT_MAP -for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) -regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) -new_pattern = compile_filename_format(regex_pattern) -match_obj = new_pattern.search(file.name) -# Get substring components as a list -filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) -valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - -for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - - -# Assign the data as new columns to the DataFrame -for key, value in data_to_add.items(): - df[key] = value - 
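# A small sketch of the resource-release checks above: dispose of the engine's connection pool,
# then use psutil to confirm the SQLite file is no longer held open by this process. The
# temporary database path is a placeholder.
import os
import tempfile

import psutil
from sqlalchemy import create_engine, text

example_db_path = os.path.join(tempfile.gettempdir(), "example_biology.db")
example_engine = create_engine(f"sqlite:///{example_db_path}")
with example_engine.begin() as connection:
    connection.execute(text("CREATE TABLE IF NOT EXISTS files_read (filepath TEXT UNIQUE)"))
example_engine.dispose()  # close all pooled connections

open_paths = [f.path for f in psutil.Process().open_files()]
print(any(example_db_path in path for path in open_paths))  # expected: False once the pool is disposed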
-for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) -biology_analysis_settings -species_id_value = 22500 -trawl_partition_value = 'Codend' # Adjust as needed -{ - key: df[ - (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & - (('trawl_partition' not in df.columns) or (df['trawl_partition'] == trawl_partition_value)) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) -} - -(match_obj.group(i)).astype(matched_key["dtype"]) -pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' -modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) -# Create the regex pattern -regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') -re.compile(regex_pattern) - -modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) - -# Create the regex pattern -regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') -compile_filename_format(regex_pattern) -# Regular expression to capture values inside the curly braces -regex = r'\{([^:}]+):([^}]+)\}' - -# Find all matches -matches = re.findall(regex, modified_pattern) - -# Get substring components as a list -filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - -pattern_changed = pattern.replace("FILE_ID:", "") - -# Compilte the filename regular expression format -compiled_regex = compile_filename_format(pattern_changed) - -file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] - - # Get the file name and produce a `re.Match` object -match_obj = compiled_regex.search(file.name) - - -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): - - # Get the file name and produce a `re.Match` object - match_obj = pattern.search(file.name) - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) - - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
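# A hedged sketch of the filename-pattern compilation above. The real LIVE_FILE_FORMAT_MAP is
# defined elsewhere in the package, so the token expressions below are assumptions made purely
# for illustration.
import re

EXAMPLE_FORMAT_MAP = {  # hypothetical stand-in for LIVE_FILE_FORMAT_MAP
    "DATE": {"expression": r"(?P<DATE>\d{6})"},
    "HAUL": {"expression": r"(?P<HAUL>\d+)"},
}

def compile_example_filename_format(file_name_format: str) -> re.Pattern:
    regex_pattern = file_name_format
    for key, value in EXAMPLE_FORMAT_MAP.items():
        regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
    # Reduce "{FILE_ID:catch_perc}" to the literal identifier "catch_perc"
    regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"\1", regex_pattern)
    return re.compile(regex_pattern)

pattern = compile_example_filename_format("{DATE}_{HAUL}_{FILE_ID:catch_perc}")
match_obj = pattern.search("202407_017_catch_perc.csv")
print(int(match_obj.group("HAUL")))  # 17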
- ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) - - # Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) - - # Get the haul number and add the the dataframe - # ---- Extract the haul number and convert to an integer - haul_num = int(match_obj.group("HAUL")) - # ---- Add the column - df_validated["haul_num"] = haul_num - - # Return the resulting DataFrame - return df_validated - -boundary_dict = griddify_definitions["bounds"] - -from geopy.distance import distance -import numpy as np -import pandas as pd -import geopandas as gpd -from echopop.spatial.projection import utm_string_generator - -## -grid_settings["grid_resolution"]["x"] = 50 -grid_settings["grid_resolution"]["y"] = 50 -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters - -# CREATE BOUNDING -bound_df = pd.DataFrame({ - "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), - "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) -}) - -bound_gdf = gpd.GeoDataFrame( - data=bound_df, - geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), - crs = projection -) -from echopop.spatial.projection import utm_string_generator -import shapely.geometry -utm_string_generator(-117.0, 33.75) -bound_gdf.total_bounds -# Convert to UTM -bound_utm = bound_gdf.to_crs(utm_num) -bound_utm.total_bounds -y_step = lat_step -x_step = lon_step -# bound_utm = bound_gdf -# y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 -# x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 - -xmin, ymin, xmax, ymax = bound_utm.total_bounds - -# Get number of cells -n_x_cells = int(np.ceil((xmax - xmin) / x_step)) -n_y_cells = int(np.ceil((ymax - ymin) / y_step)) - -import pyproj -# create the cells in a loop -# grid_cells = [] -# for x0 in np.arange(xmin, xmax, x_step): -# for y0 in np.arange(ymin, ymax, y_step): -# # bounds -# utm_zone = utm_string_generator(x0, y0) -# proj = pyproj.Proj(f"epsg:{utm_code}") -# x1 = x0-x_step -# y1 = y0+y_step -# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) +# grid_table.drop(connection) +# print("Table 'grid' has been dropped.") + +# # Inspect the database +# inspector = inspect(engine) +# tables = inspector.get_table_names() +# print(tables) + +# def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): +# """ +# Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + +# Args: +# table_name (str): The name of the table. +# columns (dict): A dictionary where keys are column names and values are data types. +# primary_keys (list, optional): List of column names to be used as primary keys. +# index_columns (list, optional): List of column names to be indexed. + +# Returns: +# str: The SQL command to create the table. 
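# A compact sketch of the gridding loop above: convert a nautical-mile resolution to metres with
# geopy, then tile a projected (UTM) bounding box with shapely boxes. The bounds and the 25 nmi
# resolution are made up for illustration; the real values come from the geospatial configuration.
import numpy as np
import shapely.geometry
from geopy.distance import distance

cell_size_m = distance(nautical=25).meters  # 25 nmi is roughly 46,300 m
xmin, ymin, xmax, ymax = 300_000.0, 3_700_000.0, 500_000.0, 3_900_000.0  # UTM metres

grid_cells = [
    shapely.geometry.box(x0, y0, x0 + cell_size_m, y0 + cell_size_m)
    for y0 in np.arange(ymin, ymax, cell_size_m)
    for x0 in np.arange(xmin, xmax, cell_size_m)
]
print(len(grid_cells))  # 25 cells for this 5 x 5 tiling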
+# """ +# # Generate column definitions +# column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) + +# # Generate primary key definition +# primary_key_definition = "" +# if primary_keys: +# primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" + +# # Generate index definitions +# index_definitions = "" +# if index_columns: +# index_definitions = "\n".join( +# f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" +# for col in index_columns +# ) + +# # Combine all parts into the final SQL command +# create_table_command = f""" +# CREATE TABLE IF NOT EXISTS {table_name} ( +# {column_definitions} +# {primary_key_definition} +# ); +# """ +# # Return the command and any index definitions +# return create_table_command.strip() + "\n" + index_definitions + +# # Define metadata and the table to drop +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Drop the table +# with engine.connect() as connection: +# grid_table.drop(connection) +# print("Table 'grid' has been dropped.") -grid_cells = [] -for y0 in np.arange(ymin, ymax, y_step): - - # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) - - for x0 in np.arange(xmin, xmax, x_step): - # bounds - # utm_zone = utm_string_generator(x0, y0) - # proj = pyproj.Proj(f"epsg:{utm_code}") - # x1, y1 = proj(x0, y0) - # x2, y2 = proj(x0 - x_step, y0 + y_step) - # grid_cells.append(box(x1, y1, x2, y2)) - x1 = x0-x_step - y1 = y0+y_step - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - -cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) -cells_gdf.shape -n_x_cells * n_y_cells -# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) -cells_gdf.total_bounds -cells_gdf.to_crs(projection).total_bounds -from shapely.validation import make_valid -from shapely.geometry import mapping -######## -world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ne_10m_land/ne_10m_land.shp") -bb_orig = box(lon_min, lat_min, lon_max, lat_max) -boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) -world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) -world_clipped_latlon = gpd.clip(world, boundary_box) -world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) - -world_utm = world.to_crs(utm_code) -world_utm = world_utm[~world_utm.is_empty] - -bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) - -gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) - -bbox_utm = bound_utm.total_bounds - -buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] -array_buffer = bbox_utm + buffer -array_names = ["minx", "miny", "maxx", "maxy"] -buffered = dict(zip(array_names, array_buffer)) -buffer_boundary = box(**buffered) -# box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) -# buffer_boundary = buffer_boundary.to_crs(world_utm.crs) - -buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) # Replace with the correct EPSG code -bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) -# sub_clipped = gpd.clip(world_utm, buffer_boundary) -# sub_clipped = gpd.clip(world_utm, bbox_utm) +# check_table_exists(engine, "grid") -# fig, ax = plt.subplots(figsize=(10, 10)) -# # Plot the buffer_boundary -# world.plot(ax=ax, linewidth=2, color='gray') -# buffer_boundary_gdf.to_crs(projection).plot(ax=ax, 
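+# NOTE: Self-contained sketch of the gridding logic above: project the lat/lon bounding box
+# into UTM, derive the cell size from a nautical-mile resolution via geopy, and tile the
+# projected bounds with shapely boxes. The 50 nmi resolution, the bounds, and the
+# EPSG:32610 zone are assumptions for illustration.
+import geopandas as gpd
+import numpy as np
+import shapely.geometry
+from geopy.distance import distance
+
+lon_min, lat_min, lon_max, lat_max = -134.25, 33.75, -120.25, 55.50
+utm_code = "epsg:32610"
+
+# Cell size: nautical miles -> meters (UTM coordinates are in meters)
+step = distance(nautical=50.0).meters
+
+bounds = gpd.GeoDataFrame(
+    geometry=[shapely.geometry.box(lon_min, lat_min, lon_max, lat_max)], crs="epsg:4326"
+).to_crs(utm_code)
+xmin, ymin, xmax, ymax = bounds.total_bounds
+
+# Tile the projected bounding box with square cells
+grid_cells = [
+    shapely.geometry.box(x0, y0, x0 + step, y0 + step)
+    for y0 in np.arange(ymin, ymax, step)
+    for x0 in np.arange(xmin, xmax, step)
+]
+cells_gdf = gpd.GeoDataFrame(geometry=grid_cells, crs=utm_code)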
facecolor='none', edgecolor='blue') -# bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') -# plt.xlim(lon_min-3, lon_max+3) -# plt.ylim(lat_min-3, lat_max+3) -# plt.show() -from echopop.live.sql_methods import SQL -from shapely import wkt -import matplotlib.pyplot as plt -import geopandas as gpd -import matplotlib.colors as colors -import matplotlib.cm as cm -import numpy as np -from matplotlib.colors import ListedColormap -import matplotlib.dates as mdates -from datetime import datetime -db_filepath = realtime_survey.config["database"]["grid"] -survey_db = realtime_survey.config["database"]["acoustics"] -grid_df = SQL(db_filepath, "select", table_name="grid_df") -# grid_df[grid_df.abundance > 0] -grid_df[grid_df.abundance > 1e10] -# grid_df[grid_df.abundance > 0] -coast_df = SQL(db_filepath, "select", table_name="coastline_df") -survey_df = SQL(survey_db, "select", table_name="survey_data_df") - -# def parse_datetime(date_str): -# # List of possible formats -# formats = [ -# '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds -# '%Y-%m-%d %H:%M:%S', # Without fractional seconds -# '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds -# '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds -# ] - -# for fmt in formats: +# with engine.connect() as connection: +# sql_create(connection, df, table_name, primary_keys) + +# # Create the table +# table_name = "grid" +# columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +# primary_keys = ["x", "y"] +# index_columns = ["x", "y"] + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# with engine.connect() as connection: +# connection.execute(text(create_sql)) + +# inspector = inspect(engine) +# tables = inspector.get_table_names() +# print(tables) + +# check_table_exists(engine, "grid") + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# converted_data[0] +# check_table_exists(engine, "files_read") + +# zarr_files_str = ["A", "B", "C", "D"] +# # ---- Create DataFrame +# current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + +# with engine.connect() as connection: +# sql_create(connection, table_name="files_read", df=current_files) +# sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) + +# table_name = "files_read" +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + + +# from sqlalchemy.exc import IntegrityError + + +# def insert_or_update(engine, table_name, columns, data, conflict_columns): +# """ +# Insert or update data in a table. + +# Args: +# engine (Engine): The SQLAlchemy engine instance. +# table_name (str): The name of the table. +# columns (list): List of column names. +# data (list of dict): List of dictionaries containing data to insert or update. +# conflict_columns (list): List of column names to use for conflict resolution. 
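+# NOTE: Sketch of the geometry round-trip used above: SQLite has no geometry type, so cell
+# polygons are stored as WKT text and parsed back with shapely.wkt on read. Table and column
+# names ("grid_df", "abundance") follow the scratch code but the data are made up.
+import geopandas as gpd
+import pandas as pd
+from shapely import wkt
+from shapely.geometry import box
+from sqlalchemy import create_engine
+
+engine = create_engine("sqlite:///:memory:")
+
+# Write: serialize the geometries to WKT strings alongside the cell attributes
+cells = gpd.GeoDataFrame({"abundance": [0.0, 12.5]},
+                         geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)], crs="epsg:4326")
+pd.DataFrame({"abundance": cells["abundance"],
+              "geometry": cells.geometry.to_wkt()}).to_sql("grid_df", con=engine, index=False)
+
+# Read: parse the WKT back into shapely objects and rebuild the GeoDataFrame
+grid_df = pd.read_sql("SELECT * FROM grid_df", con=engine)
+grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads)
+grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="epsg:4326")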
+# """ + +# # Prepare the SQL statement for insertion +# column_names = ", ".join(columns) +# placeholder = ", ".join(f":{col}" for col in columns) +# # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + + +# # Construct the SQL query +# sql = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_str} +# ON CONFLICT ({', '.join(conflict_columns)}) +# DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} +# """ + +# # Flatten the list of data for execution +# # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item +# in data] for item in sublist] + +# # Execute the SQL command +# with engine.connect() as connection: # try: -# return pd.to_datetime(date_str, format=fmt) -# except (ValueError, TypeError): -# continue # Try the next format - -# return pd.NaT # Return NaT if no formats match +# connection.execute(text(sql)) +# # connection.commit() +# print(f"Data inserted or updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") -# survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) +# # Prepare data for insertion or update +# # data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] +# data = grid_points -# pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") +# # Insert or update data +# insert_or_update(engine, table_name, columns.keys(), data, primary_keys) -# fig, ax = plt.subplots(figsize=(5, 8)) -# ax.scatter(survey_df.ping_time, survey_df.nasc) -# plt.ylabel("NASC") -# # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) -# plt.show() +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() +# for row in rows: +# print(row) -# times = np.arange(np.datetime64('2001-01-02'), -# np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) -# y = np.random.randn(len(times)) -# survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() -# survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() +# def update_specific_rows(engine, table_name, updates, conditions): +# """ +# Update specific rows in a table based on conditions. -# fig, ax = plt.subplots() -# ax.plot(times, y) -# survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] -# # a = self.input["acoustics"]["prc_nasc_df"] -# # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] +# Args: +# engine (Engine): The SQLAlchemy engine instance. +# table_name (str): The name of the table. +# updates (dict): Dictionary of columns and their new values to be updated. +# conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
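+# NOTE: Minimal working sketch of the insert-or-update helper above, using bound parameters
+# instead of string-formatted VALUES (avoids quoting problems). Requires SQLite >= 3.24 for
+# "ON CONFLICT"; the grid schema is an assumption.
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+rows = [{"x": 1, "y": 1, "value": 1.0}, {"x": 2, "y": 2, "value": 1.5}]
+
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    connection.execute(text(
+        "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value) "
+        "ON CONFLICT (x, y) DO UPDATE SET value = excluded.value"
+    ), rows)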
+# """ -grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) -coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) +# # Construct the SET clause for the update +# set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) -projection = realtime_survey.config["geospatial"]["projection"] +# # Construct the WHERE clause for the update +# where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) + +# # Construct the SQL query +# sql = f""" +# UPDATE {table_name} +# SET {set_clause} +# WHERE {where_clause} +# """ + +# # Prepare parameters for the query +# parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), parameters) +# print(f"Rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# # Define table name +# table_name = "grid" +# # Define the table and columns +# table_name = 'grid' +# condition_columns = ['x', 'y'] + +# # Define the updates and conditions +# dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": +# np.array([1, 2, 3 , 4, 5]).astype(float)} +# new_data = pd.DataFrame(dd) +# new_data +# df = new_data + +# kwargs = {"table_name": "grid", "columns": df.columns, "df": df} + +# with engine.connect() as connection: +# # sql_create(connection, table_name = "grid", df = df) +# # sql_validate(connection, "grid") +# # sql_drop(connection, "grid") +# sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, +# id_columns=["x", "y"]) + + +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# all_columns = df.columns.tolist() +# if len(condition_columns) >= len(all_columns): +# raise ValueError("The number of condition columns must be less than the number of +# columns in data.") + +# # Prepare column names and conditions +# update_columns = [col for col in all_columns if col not in condition_columns] +# condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +# update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] +# # Generate values string for SQL command +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data_tuples +# ) + +# # Construct the SQL query +# sql = f""" +# INSERT INTO {table_name} ({', '.join(all_columns)}) +# VALUES {values_str} +# ON CONFLICT ({', '.join(condition_columns)}) +# DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col inupdate_columns)} +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql)) +# connection.commit() +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + + +# # Insert or update data +# insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# # Ensure that condition_columns match the length of data tuples minus the update column +# if len(condition_columns) != len(df.columns) - 1: +# raise ValueError("The number of condition columns must match the number of columns in +# data minus the update column.") + +# # Prepare the SQL statement for update +# update_columns = [col for col in df.columns if col not in condition_columns] +# condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +# update_str = ", ".join(f"{col} = ?" 
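+# NOTE: Variant of the conflict clause drafted above in which collisions accumulate rather
+# than overwrite (e.g. summing NASC into a grid cell as new transects arrive); the schema and
+# values are illustrative.
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    upsert = text(
+        "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value) "
+        "ON CONFLICT (x, y) DO UPDATE SET value = grid.value + excluded.value"
+    )
+    connection.execute(upsert, [{"x": 1, "y": 1, "value": 2.0}])
+    connection.execute(upsert, [{"x": 1, "y": 1, "value": 3.0}])  # stored value becomes 5.0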
for col in update_columns) +# # Convert DataFrame rows to list of tuples +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# # Generate a values string for the SQL command +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data_tuples +# ) +# # Construct the SQL query +# sql = f""" +# UPDATE {table_name} +# SET {update_str} +# WHERE {condition_str} +# """ + +# # Flatten the list of data for execution +# flattened_data = [] +# for row in data_tuples: +# conditions = row[:len(condition_columns)] +# update_values = row[len(condition_columns):] +# flattened_data.extend(conditions + update_values) + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), flattened_data) +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), flattened_data) +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") +# # Update specific rows +# update_specific_rows(engine, table_name, updates, conditions) + +# # Verify the update +# sql_command = f"SELECT * FROM {table_name};" +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) +# connection.commit() + +# check_table_exists(engine, "grid") + +# # Define table name, columns, and data +# table_name = 'grid' +# columns = ['x', 'y', 'value'] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0) +# ] + +# # Prepare the columns part of the SQL statement +# columns_str = ", ".join(columns) + +# # Prepare the values part of the SQL statement +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + + +# print("Generated SQL Command:") +# print(sql_command) + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) + +# def insert_values_sql(table_name, columns, values, filter_clause=""): +# """ +# Generate a SQL command to insert values into a table. + +# Args: +# table_name (str): The name of the table. +# columns (list): List of column names to be inserted. +# values (list of tuples): List of tuples where each tuple represents a row of values +# to be inserted. +# filter_clause (str, optional): Optional filter clause to specify conditions for insertion. + +# Returns: +# str: The SQL command to insert values into the table. +# """ +# # Generate column names +# column_names = ", ".join(columns) + +# # Generate value placeholders +# value_placeholders = ", ".join("?" * len(columns)) + +# # Generate values part +# values_part = ", ".join(f"({', '.join('?' 
* len(columns))})" for _ in values) + +# # Flatten the values list for insertion +# flattened_values = [item for sublist in values for item in sublist] + +# # Create the SQL command +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# {filter_clause} +# """ +# return insert_command.strip(), flattened_values + +# # Define the values for insertion +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0)] + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Data:\n", insert_data) + +# insrt_stmt = + +# with engine.connect() as connection: +# connection.execute(text(insert_sql), tuple(insert_data)) + +# # Define the values for insertion +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# # Call the function +# insert_or_update_table(engine, table_name, columns, data, conflict_columns) + +# # Example usage +# table_name = "grid" +# columns = ["x", "y", "value"] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0), +# ] + +# sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" +# test_data = [{'x': 1, 'y': 1, 'value': 1.0}] + +# with engine.connect() as connection: +# connection.execute(text(sql_command), test_data) + +# # Generate the SQL command and data +# insert_stmt = insert_into_table(table_name, columns, data) + +# # Print the generated SQL command (for validation) +# print("Insert SQL Command:") +# print(insert_stmt) + +# # Print for validation +# print("Insert SQL Command:") +# print(insert_sql) +# print("Data:") +# print(insert_data) + +# # Example execution with SQLAlchemy +# with engine.connect() as connection: +# connection.execute(insert_stmt) + +# def insert_values_sql(table_name, columns, values): +# """ +# Generate SQL command for inserting values into a table. + +# Args: +# table_name (str): The name of the table. +# columns (list): List of column names. +# values (list of tuples): List of values to insert. + +# Returns: +# str: The SQL command to insert the values. +# list: Flattened list of values for binding to the SQL command. +# """ +# column_names = ", ".join(columns) +# value_placeholders = ", ".join("?" * len(columns)) +# values_part = ", ".join(f"({value_placeholders})" for _ in values) +# flattened_values = [item for sublist in values for item in sublist] + +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# """ +# return insert_command.strip(), flattened_values + +# def check_table_exists(engine, table_name): +# """ +# Check if a table exists in the database. + +# Args: +# engine: SQLAlchemy engine object. +# table_name (str): The name of the table to check. + +# Returns: +# bool: True if the table exists, False otherwise. +# """ +# inspector = inspect(engine) +# return table_name in inspector.get_table_names() + +# with engine.connect() as connection: +# # sql_validate(connection, "grid") +# sql_inspect(connection) +# sql_drop(connection, table_name) -grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) -grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] -coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) +# def select_from_table(engine, table_name, columns='*'): +# """ +# Select data from a table. 
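+# NOTE: Working sketch of the two helpers above: a multi-row parameterized INSERT (letting
+# the driver handle placeholders instead of flattening values by hand) and a table-existence
+# check through SQLAlchemy's inspector. The "grid" schema is again an assumption.
+from sqlalchemy import create_engine, inspect, text
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    connection.execute(
+        text("INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)"),
+        [{"x": 1, "y": 1, "value": 1.0}, {"x": 2, "y": 2, "value": 1.5}],
+    )
+
+def check_table_exists(engine, table_name: str) -> bool:
+    # The inspector reflects current table names directly from the database
+    return table_name in inspect(engine).get_table_names()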
-lims = grid_gdf.total_bounds -# nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] -# dataset_gdf.stratum_x.max() -# # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) +# Args: +# engine: SQLAlchemy engine object. +# table_name (str): The name of the table to select from. +# columns (str or list): Columns to select. '*' selects all columns. + +# Returns: +# list: List of rows returned by the query. +# """ +# metadata = MetaData(bind=engine) +# table = Table(table_name, metadata, autoload_with=engine) + +# if columns == '*': +# columns = [col.name for col in table.columns] +# elif isinstance(columns, str): +# columns = [columns] + +# stmt = select([table.c[col] for col in columns]) + +# with engine.connect() as connection: +# result = connection.execute(stmt) +# return result.fetchall() + +# # Create table +# table_name = "grid" +# columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +# primary_keys = ["x", "y"] +# index_columns = ["value"] + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# with engine.connect() as connection: +# connection.execute(create_sql) + +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# # Insert data function +# def insert_values_sql(table_name, columns, values): +# column_names = ", ".join(columns) +# value_placeholders = ", ".join("?" * len(columns)) +# values_part = ", ".join(f"({value_placeholders})" for _ in values) + +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# """ +# # Flatten the list of values into a single list +# flattened_values = [value for sublist in values for value in sublist] + +# return insert_command.strip(), flattened_values + + +# table_name = 'grid' +# columns = ['x', 'y', 'value'] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0) +# ] + +# # Prepare the columns part of the SQL statement +# columns_str = ", ".join(columns) + +# # Prepare the values part of the SQL statement +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# print(f"Data in table {table_name}:") +# for row in rows: +# print(row) +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Insert Data:\n", insert_data) + +# with engine.connect() as connection: +# connection.execute(insert_sql, [insert_data]) + +# # Check table existence +# exists = check_table_exists(engine, table_name) +# print(f"Table '{table_name}' exists: {exists}") + +# # Select data from table +# data = select_from_table(engine, table_name, insert_columns) +# print(f"Data from '{table_name}':") +# for row in data: +# print(row) + + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# # Define the values for insertion 
+# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0)] + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Data:\n", insert_data) + +# # Example usage +# table_name = "grid" +# columns = { +# "x": "INTEGER", +# "y": "INTEGER", +# "value": "REAL" +# } +# primary_keys = ["x", "y"] +# index_columns = ["value"] + +# sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) +# print(sql_command) + +# # Create the table +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS grid ( +# x INTEGER, +# y INTEGER, +# value REAL, +# PRIMARY KEY (x, y) +# ); +# """ + +# # Insert grid points +# insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) +# insert_sql = f""" +# INSERT INTO grid (x, y, value) VALUES {insert_values}; +# """ + +# # Connect to the database and execute the commands +# with engine.connect() as connection: +# try: +# # Create table if it does not exist +# connection.execute(text(create_table_sql)) +# # Insert grid points +# connection.execute(text(insert_sql)) +# connection.commit() +# print("Grid points successfully inserted.") +# except Exception as e: +# print(f"An error occurred: {e}") + + +# engine = create_engine(f"sqlite:///{db_file}") +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Read existing grid values from the database into a DataFrame +# with engine.connect() as connection: +# select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) +# result = connection.execute(select_stmt) +# existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# # Coordinates to update +# update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# # Create a dictionary for fast lookup +# update_dict = {(i, j): 1.0 for i, j in update_coords} + +# # Update the grid_points with new values where applicable +# updated_grid_points = [ +# (i, j, update_dict.get((i, j), value)) +# for i, j, value in grid_points +# ] + +# # Convert the list of tuples to a DataFrame +# df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) + +# # Print the DataFrame +# print(df_updated_grid_points) + +# # Merge existing and updated data to find differences +# merged_data = pd.merge(existing_data, df_updated_grid_points, +# on=['x', 'y'], suffixes=('_existing', '_updated')) +# differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] + +# # Assuming 'differences' is your DataFrame with updated values +# # Create a dictionary for batch updating +# update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() + +# # Generate the SQLAlchemy update statement +# update_stmt = update(grid_table).where( +# grid_table.c.x.in_(update_dict.keys()) +# ).values({ +# grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) +# }) + +# # Create the CASE statement +# case_stmt = case( +# { +# (grid_table.c.x == x) & (grid_table.c.y == y): value +# for (x, y), value in update_dict.items() +# }, +# else_=grid_table.c.value +# ) + +# # Convert the DataFrame into a dictionary of case statements +# case_stmt = case( +# [(grid_table.c.x == x) & (grid_table.c.y == y), value] +# for (x, y), value in update_dict.items() +# ) + +# # Create the case statement +# case_stmt = case( +# { (x, y): value for (x, y), value in update_dict.items() }, +# value=grid_table.c.x, # Assuming `x` is the column being compared 
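+# NOTE: Sketch of the "update only what changed" idea explored above: compare the stored grid
+# with a recomputed one via a pandas merge, then push just the differing cells back with a
+# parameterized UPDATE. Data and schema are illustrative.
+import pandas as pd
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+existing = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 3], "value": [0.0, 0.0, 0.0]})
+existing.to_sql("grid", con=engine, index=False)
+
+updated = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 3], "value": [0.0, 1.5, 2.0]})
+
+# Keep only the cells whose value actually changed
+merged = existing.merge(updated, on=["x", "y"], suffixes=("_old", "_new"))
+changed = merged[merged["value_old"] != merged["value_new"]]
+params = [
+    {"x": int(row.x), "y": int(row.y), "value_new": float(row.value_new)}
+    for row in changed.itertuples(index=False)
+]
+
+with engine.begin() as connection:
+    connection.execute(text("UPDATE grid SET value = :value_new WHERE x = :x AND y = :y"), params)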
+# else_=grid_table.c.value +# ) + +# case_stmt = case( +# { +# (x, y): value +# for (x, y), value in update_dict.items() +# }, +# value=grid_table.c.x, +# else_=grid_table.c.value +# ) + +# # Create the case statement +# # Create a CASE statement using a dictionary +# case_stmt = case( +# { +# (grid_table.c.x == x) & (grid_table.c.y == y): value +# for (x, y), value in update_dict.items() +# }, +# else_=grid_table.c.value +# ) +# case_stmt = case( +# {((grid_table.c.x == x) & (grid_table.c.y == y)): value +# for (x, y), value in update_dict.items()}, +# else_=grid_table.c.value +# ) +# print("Case Statement:", str(case_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + + +# # Create the update statement +# update_stmt = ( +# update(grid_table). +# where(grid_table.c.value != case_stmt). +# values(value=case_stmt) +# ) + +# print("Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + + +# # Print the SQL for each update +# for (x, y), value in update_dict.items(): +# update_stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=value) +# ) +# # Print the SQL statement with literal values for debugging +# print("Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + +# # Execute the update statement +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") + +# # Execute the update +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries.") + +# engine.dispose() + +# engine = create_engine(f"sqlite:///{db_file}") +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Verify the updated rows +# select_stmt = select(grid_table) + +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# # Define your SQLite engine and metadata +# engine = create_engine(F'sqlite:///{db_file}') +# metadata = MetaData() + +# # Reflect the grid table +# grid_table = Table('grid', metadata, autoload_with=engine) + +# # Define your update dictionary +# update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} + +# # Execute updates +# # with engine.connect() as connection: +# connection = engine.connect() +# # for (x, y), value in update_dict.items(): +# (x,y) = (1, 1) +# value = update_dict[(1,1)] + +# update_stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=value) +# ) +# # Print the SQL statement for debugging +# print("Executing Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + +# # Execute the update statement +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") +# connection.close() + +# select_stmt = select(grid_table.c.x) + +# # Execute the SELECT statement +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# x_values = result.fetchall() + +# type(x_values[0]) + +# select_stmt = select(grid_table.c.y) + +# # Execute the SELECT statement +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# y_values = result.fetchall() + +# select_stmt = select(grid_table.c.value) + +# # Execute the SELECT statement +# 
with engine.connect() as connection: +# result = connection.execute(select_stmt) +# values = result.fetchall() + +# case_stmt = case( +# *[(grid_table.c.x == x) & (grid_table.c.y == y, value) +# for (x, y), value in update_dict.items()], +# else_=grid_table.c.value +# ) + +# update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} + +# with engine.connect() as connection: +# # Select all values to check the current state +# result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) +# current_values = result.fetchall() +# print("Current Values:", current_values) + +# with engine.connect() as connection: +# with connection.begin(): # Begin a transaction +# for (x, y), value in update_dict.items(): +# stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=grid_table.c.value + value) +# ) +# connection.execute(stmt) + +# with engine.connect() as connection: +# # Re-select to check the updated state +# result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) +# updated_values = result.fetchall() +# print("Updated Values:", updated_values) + + +# # Confirm the updates +# with engine.connect() as connection: +# select_stmt = select([grid_table]) +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# # Print all rows to verify updates +# print("Database contents after update:") +# for row in rows: +# print(row) + + +# # Construct the update statement +# update_stmt = ( +# update(grid_table) +# .values(value=case_stmt) +# .where(grid_table.c.value != case_stmt) +# ) + +# # Create a SELECT statement to fetch all rows from the grid_table +# select_stmt = select(grid_table) + +# # Execute the SELECT statement and fetch results +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# # Print or inspect the fetched rows +# for row in rows: +# print(row) + +# # Create the update statement +# update_stmt = ( +# update(grid_table) +# .where(grid_table.c.value != case_stmt) +# .values(value=case_stmt) +# ) + +# # Execute the update +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries.") + +# case( +# [ +# ((grid_table.c.x == x) & (grid_table.c.y == y), value) +# for (x, y), value in update_dict.items() +# ], +# else_=grid_table.c.value +# ) + +# # Create a case statement for conditional update +# case_statements = { +# (x, y): case( +# [(grid_table.c.x == x) & (grid_table.c.y == y, value)], +# else_=grid_table.c.value +# ) +# for (x, y), value in update_dict.items() +# } + + +# # Define SQL command to select all data from the grid table +# select_sql = "SELECT * FROM grid;" + +# # Connect to the database and execute the query +# with engine.connect() as connection: +# try: +# # Execute the select command +# result = connection.execute(text(select_sql)) +# # Fetch all rows from the result +# rows = result.fetchall() +# # Print the results +# print("Data in grid table:") +# for row in rows: +# print(row) +# except Exception as e: +# print("An error occurred: {}".format(e)) + +# # Coordinates to update +# update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# # Create a copy of grid_points and update specific coordinates +# updated_grid_points = [ +# (i, j, 1.0) if (i, j) in update_coords else (i, j, value) +# for i, j, value in grid_points +# ] + +# # Retrieve current data from the database +# with engine.connect() as 
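+# NOTE: One way to make the CASE-based bulk update attempted above work, using the tuple form
+# of sqlalchemy.case() (SQLAlchemy 1.4+/2.0 style); the grid table and update_dict values are
+# assumptions for illustration.
+from sqlalchemy import Column, Float, Integer, MetaData, Table, case, create_engine, select, update
+
+engine = create_engine("sqlite:///:memory:")
+metadata = MetaData()
+grid = Table(
+    "grid", metadata,
+    Column("x", Integer, primary_key=True),
+    Column("y", Integer, primary_key=True),
+    Column("value", Float),
+)
+metadata.create_all(engine)
+
+with engine.begin() as conn:
+    conn.execute(grid.insert(), [{"x": i, "y": i, "value": 0.0} for i in range(1, 4)])
+
+update_dict = {(1, 1): 1.0, (2, 2): 2.5}
+
+# WHEN (x = 1 AND y = 1) THEN 1.0 WHEN (x = 2 AND y = 2) THEN 2.5 ELSE value END
+case_stmt = case(
+    *[((grid.c.x == x) & (grid.c.y == y), val) for (x, y), val in update_dict.items()],
+    else_=grid.c.value,
+)
+
+with engine.begin() as conn:
+    conn.execute(update(grid).values(value=case_stmt))
+    rows = conn.execute(select(grid)).fetchall()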
connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# current_data = result.fetchall() + +# # Convert to a dictionary for easy comparison +# current_values = {(x, y): value for x, y, value in current_data} + +# # Convert updated_grid_points to a dictionary +# updated_values = {(i, j): value for i, j, value in updated_grid_points} + +# # Find differences +# differences = [ +# (i, j, value) +# for i, j, value in updated_grid_points +# if (i, j) in updated_values and (i, j) not in current_values or +# (i, j) in current_values and current_values[(i, j)] != value +# ] + +# # Update differing values in the database +# with engine.connect() as connection: +# for i, j, value in differences: +# connection.execute( +# text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), +# ) +# print(f"Updated {len(differences)} entries.") + +# # Step 8: Read the table into Python +# with engine.connect() as connection: +# # Query to select all rows from the table +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# # Print the DataFrame to validate the changes +# print(df) + +# # Check current values +# with engine.connect() as connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} + +# print("Current grid points in database:") +# for row in current_values.items(): +# print(row) + +# print("Updated grid points with changes:") +# for row in updated_grid_points: +# print(row) + +# # Determine differences +# differences = [ +# (i, j, value) +# for i, j, value in updated_grid_points +# if (i, j) in current_values and current_values[(i, j)] != value +# ] + +# print(f"Differences to update: {differences}") + +# # Step 6: Update the database with INSERT OR REPLACE +# with engine.connect() as connection: +# with connection.begin(): # Ensure transactions are committed +# for i, j, value in updated_grid_points: +# sql = """ +# INSERT OR REPLACE INTO grid (x, y, value) +# VALUES (:x, :y, :value) +# """ +# print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") +# connection.execute( +# text(sql), +# {"x": i, "y": j, "value": value} +# ) +# print(f"Updated entries with INSERT OR REPLACE.") + +# # Step 8: Read the table into Python +# with engine.connect() as connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# rows = result.fetchall() +# df = pd.DataFrame(rows, columns=['x', 'y', 'value']) + +# # Print the DataFrame to validate the changes +# print("Updated table data:") +# print(df) + + +# engine.dispose() + +# # Check if the file exists and then remove it +# if db_file.exists(): +# db_file.unlink() +# print(f"Deleted the file: {db_file}") +# else: +# print(f"The file does not exist: {db_file}") + +# with engine.connect() as connection: +# connection.execute(text(""" +# CREATE TABLE IF NOT EXISTS grid ( +# x INTEGER, +# y INTEGER, +# value REAL, +# PRIMARY KEY (x, y) +# ); +# """)) + +# connection.execute(text(""" +# INSERT OR REPLACE INTO grid (x, y, value) VALUES +# (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), +# (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), +# (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), +# (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), +# (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); +# """)) + +# # Insert initial values (0) into the grid table +# values = ",".join(["({}, {}, 
{})".format(i, j, 0) for i, j, _ in grid_points]) +# connection.execute(text("INSERT INTO grid (x, y, value) VALUES {values};" +# .format(values=values))) + +# # Commit +# connection.commit() + +# # Verify data insertion +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# print("Data in grid table:", rows) + +# connection.execute(text(""" +# INSERT INTO grid (x, y, value) VALUES +# """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) + +# engine.dispose() + + +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# print("Data in grid table:", rows) + +# with engine.connect() as connection: +# result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) +# print(result.fetchall()) + +# with engine.connect() as connection: +# # Describe the table schema +# result = connection.execute(text("PRAGMA table_info(grid);")) +# columns = result.fetchall() +# print("Table schema:", columns) + +# with engine.connect() as connection: +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# for row in rows: +# print(row) + +# SQL(db_file, command="select") + + +# import geopandas as gpd +# import geopy +# import matplotlib.pyplot as plt +# import numpy as np +# import pandas as pd +# import pyproj +# import shapely.geometry +# from geopy.distance import distance +# from shapely.geometry import Point, Polygon, box +# from shapely.ops import unary_union + +# from echopop.spatial.projection import utm_string_generator, wgs84_to_utm +# from echopop.survey import Survey + +# survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/ini +# tialization_config.yml" , +# survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_fil +# es/survey_year_2019_config.yml" ) + + +# grid_settings = file_configuration["geospatial"]["griddify"] +# # lat_min = grid_settings["bounds"]["latitude"][0] +# lat_min = 33.75 +# # lat_max = grid_settings["bounds"]["latitude"][1] +# lat_max = 55.50 +# # lon_min = grid_settings["bounds"]["longitude"][0] +# lon_min = -134.25 +# lon_max = grid_settings["bounds"]["longitude"][1] + +# projection = file_configuration["geospatial"]["projection"] + +# utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +# utm_num = int(utm_code) +# utm_str = f"epsg:{utm_num}" + +# biology_data = filtered_biology_output + +# from sqlalchemy import Engine, create_engine, inspect, text + +# root_dir = file_configuration["data_root_dir"] +# db_directory = Path(root_dir) / "database" +# db_directory.mkdir(parents=True, exist_ok=True) +# db_file = db_directory / "biology.db" +# # Create the engine with the full path +# engine = create_engine(f'sqlite:///{db_file}') + +# SQL_COMMANDS = { +# "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", +# "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", +# "drop": "DROP TABLE IF EXISTS {table_name};", +# "select": "SELECT {columns} FROM {table_name};", +# "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", +# # "insert": "INSERT INTO {table_name} ({columns});", +# "insert": """ +# INSERT INTO {table_name} ({columns}) +# SELECT {columns} +# FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) +# {filter_clause}; +# """, +# "inspect": None, +# } + +# SQL_DTYPES = { +# 'int32': 'INTEGER', +# 'int64': 
'INTEGER', +# 'float64': 'FLOAT', +# 'bool': 'BOOLEAN', +# 'datetime64[ns]': 'DATETIME', +# 'object': 'TEXT' +# } + +# def SQL(db_file: str, command: str, **kwargs): + +# # Create engine from `db_file` string +# engine = create_engine(f"sqlite:///{db_file}") + +# # Format `columns`, if there are any and more than 1 +# if "columns" in kwargs.keys(): +# if isinstance(kwargs["columns"], list): +# kwargs["columns"] = ", ".join(kwargs["columns"]) +# else: +# kwargs["columns"] = "*" + +# # Format `columns`, if there are any and more than 1 +# # if "filter_columns" in kwargs.keys(): +# # # ---- Store the value for later +# # kwargs["filter_columns_store"] = kwargs["filter_columns"] +# # if isinstance(kwargs["filter_columns"], list): +# # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) + +# # Run the command +# try: +# with engine.connect() as connection: +# # ---- SELECT +# if command == "select": +# return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) +# # ---- CREATE +# elif command == "create": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Check whether the table already exists or not +# table_exists = ( +# connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() +# ) +# # ---- If it doesn't, pre-allocate the table +# if table_exists is None: +# # ---- Get column definitions as a string +# column_def_dict = { +# col: SQL_DTYPES.get(str(dtype), 'TEXT') +# for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) +# } +# # ---- Convert to a single string +# kwargs["column_definitions"] = ( +# ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) +# ) +# # ---- Create table +# connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) +# # ---- REPLACE +# elif command == "replace": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Replace current +# df_to_add.to_sql(name=kwargs["table_name"], +# con=connection, +# if_exists="replace", index=False) + +# # ---- INSERT +# elif command == "insert": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Check if +# # table_exists = ( +# # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() +# # ) +# # tables = SQL(db_file, "inspect") +# # ---- If it doesn't, pre-allocate the table +# # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): +# df_to_add.to_sql(name=kwargs["table_name"], +# con=connection, +# if_exists="append", index=False) +# # else: +# # # ---- Format `filter_columns` command if present +# # if "filter_columns" in kwargs.keys(): +# # # ---- Fetch table +# # fetch_table = ( +# # connection.execute(text( +# # ("SELECT DISTINCT {filter_columns} FROM {table_name}") +# # .format(**kwargs)) +# # ) +# # ) +# # # ---- Format the SQL data into a DataFrame +# # fetched_df = pd.DataFrame(fetch_table.fetchall(), +# columns=fetch_table.keys()) +# # # ---- Create an index tuples +# # index_tuples = ( +# # set(fetched_df[kwargs["filter_columns_store"]] +# # .itertuples(index=False, name=None)) +# # ) +# # # ---- Filter the dataframe +# # filtered_df = ( +# # df_to_add[ +# # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) +# # .isin(index_tuples) +# # ] +# # ) +# # # ---- Insert the data +# # filtered_df.to_sql(name=kwargs["table_name"], +# # con=connection, +# # if_exists="append", index=False) +# # else: +# # df_to_add.to_sql(name=kwargs["table_name"], +# # con=connection, +# # if_exists="append", index=False) +# # ---- INSPECT +# elif command 
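+# NOTE: Sketch of the SQL_DTYPES idea above: map pandas dtypes to SQLite column types so a
+# table can be pre-allocated from a DataFrame's schema before appending rows. The table name
+# and columns are illustrative.
+import pandas as pd
+from sqlalchemy import create_engine, inspect, text
+
+sql_dtypes = {"int64": "INTEGER", "int32": "INTEGER", "float64": "FLOAT",
+              "bool": "BOOLEAN", "datetime64[ns]": "DATETIME", "object": "TEXT"}
+
+df = pd.DataFrame({"haul_num": [1, 2], "weight_kg": [1.5, 2.0], "sex": ["M", "F"]})
+column_defs = ", ".join(
+    f"{col} {sql_dtypes.get(str(dtype), 'TEXT')}" for col, dtype in df.dtypes.items()
+)
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(f"CREATE TABLE IF NOT EXISTS specimen_df ({column_defs});"))
+    df.to_sql("specimen_df", con=connection, if_exists="append", index=False)
+
+tables = inspect(engine).get_table_names()  # ["specimen_df"]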
== "inspect": +# return inspect(engine).get_table_names() +# else: +# connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) +# finally: +# # ---- Dispose of the engine to release any resources being pooled/used +# engine.dispose() + +# _ = SQL(db_file, "drop", table_name="catch_df") +# _ = SQL(db_file, "drop", table_name="specimen_df") +# _ = SQL(db_file, "drop", table_name="length_df") +# _ = SQL(db_file, "drop", table_name="files_read") + +# _ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) +# current = SQL(db_file, "select", table_name="files_read", columns="filepath") +# current + + +# # Get acoustic directory and initialization settings +# # ---- Files +# biology_file_settings = file_configuration["input_directories"]["biological"] +# # ---- General settings +# biology_analysis_settings = file_configuration["biology"] + +# # Get the file-specific settings, datatypes, columns, etc. +# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = biology_file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# # ---- Initialize the SQL dictionary +# sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + +# # Create full filepath +# biology_directory_path = ( +# Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] +# ) +# # ---- Directory check +# directory_existence = biology_directory_path.exists() +# # ---- Error evaluation (if applicable) +# if not directory_existence: +# raise FileNotFoundError( +# f"The acoustic data directory [{biology_directory_path}] does not exist." +# ) +# # ---- Get the defined file extension +# file_extension = biology_file_settings["extension"] +# # ---- Create Path.glob generator object +# file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") +# #---- Create list of `*.csv`` files +# csv_files = list(file_path_obj) +# # ---- Ensure files exist or raise error otherwise +# if len(csv_files) < 1: +# raise FileNotFoundError( +# f"No `*.csv` files found in [{biology_directory_path}]!" 
+# ) +# else: +# # ---- Create Path to SQL database file +# db_directory = Path(file_configuration["data_root_dir"]) / "database" +# # ---- Create the directory if it does not already exist +# db_directory.mkdir(parents=True, exist_ok=True) +# # ---- Complete path to `biology.db` +# db_file = db_directory / "biology.db" +# # ---- Query the external SQL database to see if the file tracking table exists +# tables = SQL(db_file, "inspect") +# # ---- Create a list of string-formatted Path names +# csv_files_str = [str(file) for file in csv_files] +# # ---- Create DataFrame +# current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) +# # ---- Create if it is missing and then advance `csv_files` +# if "files_read" not in tables: +# # ---- Insert into the SQL database file +# _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", +# dataframe=current_files) +# # ---- Create empty list for later comparison +# new_files = [] +# else: +# # ---- Pull already processed filenames +# previous_files = SQL(db_file, "select", table_name="files_read") +# # ---- Compare against the current filelist +# new_files = ( +# [file for file in csv_files_str if file not in set(previous_files["filepath"])] +# ) +# # ---- Create a DataFrame for the new files +# new_files_df = pd.DataFrame(new_files, columns=["filepath"]) +# # ---- Insert into the SQL database file +# _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + +# # Iterate through each of the file ids and read in the data +# for id in list(biology_file_ids.keys()): +# # ---- Extract the specific config mapping for this tag/id +# sub_config_map = biology_config_map[id] +# # ---- Drop the `{FIELD_ID}` tag identifier +# file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) +# # ---- Replace all other tags with `*` placeholders +# file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) +# # ---- Create Path object with the generalized format +# subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") +# # ---- List all files that match this pattern +# subcsv_files_str = [str(file) for file in list(subfile_path_obj)] +# # ---- Filter for only new files +# subset_files = set(subcsv_files_str).intersection(set(new_files)) +# # ---- Pull from SQL database, if applicable +# if f"{id}_df" in tables: +# # ---- SELECT +# sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") +# # ---- Concatenate to the dictionary +# sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) +# # ---- Add data files not stored in SQL database +# if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: +# if len(subset_files) > 0: +# file_list = subset_files +# else: +# file_list = subcsv_files_str +# # ---- Create a list of relevant dataframes +# sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) +# for file in file_list] +# # ---- Concatenate into a single DataFrame +# sub_df = pd.concat(sub_df_lst, ignore_index=True) +# # ---- Concatenate to the dictionary DataFrame +# biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + +# # Get contrasts used for filtering the dataset +# # ---- Species +# species_filter = file_configuration["species"]["number_code"] +# # ---- Trawl partition information +# trawl_filter = biology_analysis_settings["catch"]["partition"] +# # ---- Apply the filter +# filtered_biology_output = { +# key: df[ +# (df['species_id'] == species_filter if 
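+# NOTE: Compact sketch of the file-tracking step above: record processed file paths in a
+# "files_read" table and only ingest paths not already present. The database is in-memory and
+# the paths are made up.
+import pandas as pd
+from sqlalchemy import create_engine
+
+engine = create_engine("sqlite:///:memory:")
+current_files = ["biology/202407_017_catch_perc.csv", "biology/202407_018_catch_perc.csv"]
+
+# Seed the tracking table with one "already processed" file
+pd.DataFrame({"filepath": current_files[:1]}).to_sql("files_read", con=engine, index=False)
+
+# Compare against what has been logged, then append only the new paths
+previous = pd.read_sql("SELECT filepath FROM files_read", con=engine)
+new_files = [file for file in current_files if file not in set(previous["filepath"])]
+pd.DataFrame({"filepath": new_files}).to_sql("files_read", con=engine,
+                                             if_exists="append", index=False)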
'species_id' in df.columns else True) & +# (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns +# else True) +# ] +# for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty +# } + +# # Update the SQL database +# for table_name, df in filtered_biology_output.items(): +# # ---- Update +# _ = SQL(db_file, "insert", table_name=table_name, columns="*", +# dataframe=df) + +# # Combine the two datasets +# merged_output = { +# key: pd.concat([ +# sql_biology_output.get(key, pd.DataFrame()), +# filtered_biology_output.get(key, pd.DataFrame()) +# ]).drop_duplicates().reset_index(drop=True) +# for key in set(sql_biology_output) | set(filtered_biology_output) +# } +# # ---- Return output +# merged_output + +# coordinate_metadata.attrs[] + +# SQL(biology_db, command="drop", table_name="catch_df") +# SQL(biology_db, command="drop", table_name="specimen_df") +# SQL(biology_db, command="drop", table_name="length_df") +# SQL(biology_db, command="drop", table_name="files_read") +# _ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") +# tables = SQL(db_file, "inspect") +# tables +# current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) +# current + +# SQL(db_file, "select", table_name="catch_df", columns="*") +# new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) +# _ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) +# current = SQL("select", engine, table_name="csv_files_read", columns="file_path") +# current +# for table_name, df in biology_data.items(): +# df.to_sql(table_name, con=engine, if_exists='append', index=False) +# command = "read" +# engine = create_engine(f'sqlite:///{db_file}') +# table_name = "files_read" +# columns = "file_path" + +# kwargs = { +# "table_name": table_name, +# "columns": columns, +# } + +# zarr_data_ds["depth"].diff(dim="depth") + +# prc_nasc_df.groupby(["longitude", "latitude"]) + +# from pandas.core.groupby import DataFrameGroupBy + + +# def estimate_echometrics(acoustic_data_df: pd.DataFrame): + +# # Create copy +# acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + +# # Pre-compute the change in depth +# acoustic_df["dz"] = acoustic_df["depth"].diff() + +# # Initialize echometrics dictionary +# echometrics = {} + +# # Compute the metrics center-of-mass +# if acoustic_df["NASC"].sum() == 0.0: +# echometrics.update({ +# "n_layers": 0, +# "mean_Sv": -999, +# "max_Sv": -999, +# "nasc_db": np.nan, +# "center_of_mass": np.nan, +# "dispersion": np.nan, +# "evenness": np.nan, +# "aggregation": np.nan, +# "occupied_area": 0.0, +# }) +# else: + +# # Compute the number of layers +# echometrics.update({ +# "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size +# }) + +# # Compute ABC +# # ---- Convert NASC to ABC +# acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) +# # ---- Estimate mean Sv +# echometrics.update({ +# "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) +# }) +# # --- Estimate max Sv (i.e. 
) +# echometrics.update({ +# "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() +# / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) +# }) + +# # Compute (acoustic) abundance +# echometrics.update({ +# "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) +# }) + +# # Compute center of mass +# echometrics.update({ +# "center_of_mass": ( +# (acoustic_df["depth"] * acoustic_df["NASC"]).sum() +# / (acoustic_df["NASC"]).sum() +# ) +# }) + +# # Compute the dispersion +# echometrics.update({ +# "dispersion": ( +# ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 +# * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() +# ) +# }) + +# # Compute the evenness +# echometrics.update({ +# "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 +# }) + +# # Compute the index of aggregation +# echometrics.update({ +# "aggregation": 1 / echometrics["evenness"] +# }) + +# # Get the occupied area +# echometrics.update({ +# "occupied_area": ( +# acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() +# ) +# }) + +# # Return the dictionary +# return echometrics + +# def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + +# # Vertically integrate PRC NASC +# nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + +# # Horizontally concatenate `echometrics`, if `True` +# if echometrics: +# # ---- Compute values +# # NOTE: This uses NASC instead of linear `sv` +# echometrics_dict = estimate_echometrics(acoustic_data_df) +# # ---- Merge +# nasc_dict.update(echometrics_dict) + +# # Convert `nasc_dict` to a DataFrame and return the output +# return pd.Series(nasc_dict) + +# def process_group(group): +# result = integrate_nasc(group, echometrics=True) +# result = result.reset_index(drop=True) +# # Concatenate the result back to the original group for alignment +# group = group.reset_index(drop=True) +# combined = pd.concat([group, result], axis=1) +# return combined + +# acoustic_data_df = acoustic_data["prc_nasc_df"] + + +# rc_nasc_df[prc_nasc_df["distance"] == 0.0] +# acoustic_data_df = mek[mek["distance"] == 0.0] +# pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() +# nasc_data_df = ( +# prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) +# .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) +# .reset_index() +# ) + + +# kwargs = { +# "table_name": "csv_files_read", +# "columns": "file_path", +# "dataframe": new_files_df +# } + +# current_process = psutil.Process() +# import logging + +# # Create a session +# Session = sessionmaker(bind=engine) +# session = Session() + +# # Perform database operations +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) +# logger.info("Performing database operations") + +# # Create a session +# Session = sessionmaker(bind=engine) +# session = Session() + +# # Perform database operations +# logger.info("Performing database operations") + +# # Close the session +# session.close() +# logger.info("Session closed") + +# # Dispose the engine +# engine.dispose() +# logger.info("Engine disposed") + +# # Force garbage collection +# import gc + +# gc.collect() +# logger.info("Garbage collection performed") + +# import psutil + +# pid = psutil.Process().pid +# process = psutil.Process(pid) +# open_files = process.open_files() +# db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' + +# # Check if the file is still in use +# for file in open_files: +# if db_path in 
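+# NOTE: Minimal numeric sketch of the echometrics defined above (center of mass, dispersion,
+# evenness, and index of aggregation) for a single NASC-by-depth profile; the profile values
+# are made up.
+import pandas as pd
+
+profile = pd.DataFrame({"depth": [10.0, 20.0, 30.0, 40.0],
+                        "NASC": [0.0, 150.0, 300.0, 50.0]})
+
+nasc_sum = profile["NASC"].sum()
+center_of_mass = (profile["depth"] * profile["NASC"]).sum() / nasc_sum
+dispersion = ((profile["depth"] - center_of_mass) ** 2 * profile["NASC"]).sum() / nasc_sum
+evenness = (profile["NASC"] ** 2).sum() / nasc_sum ** 2
+aggregation = 1.0 / evenness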
file.path: +# logger.info(f"File {db_path} is still in use.") +# else: +# logger.info(f"File {db_path} is not in use.") + +# # Define the SQL to drop the table +# drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" +# # Execute the drop table SQL +# with engine.connect() as connection: +# _ = connection.execute(text(drop_table_sql)) + +# import sqlite3 + +# if os.path.exists(db_path): +# conn = sqlite3.connect(db_path) +# conn.close() +# # Force the file to be removed +# try: +# os.remove(db_path) +# print(f"Database file {db_path} has been deleted.") +# except PermissionError: +# print(f"Failed to delete {db_path}. The file is still in use.") + +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS csv_files_read ( +# file_path TEXT UNIQUE +# ); +# """ +# # Execute the create table SQL +# with engine.connect() as connection: +# _ = connection.execute(text(create_table_sql)) + +# root_directory = Path(root_dir) +# dataset = "biology" + +# # Convert to strings +# csv_files_str = [str(file) for file in csv_files] + +# existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) +# existing_files_set = set(existing_files_df['file_path']) +# # Filter out duplicates from the csv_files list +# new_files = [file for file in csv_files_str if file not in existing_files_set] +# # Insert only new file paths into the SQL table +# if new_files: +# new_files_df = pd.DataFrame(new_files, columns=['file_path']) +# _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) + + +# with engine.connect() as conn: +# conn.execute(""" +# CREATE TABLE IF NOT EXISTS csv_files_read ( +# file_path TEXT UNIQUE +# ) +# """) + +# csv_files +# files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) +# file_name_format = biology_file_ids[id] +# def compile_filename_format(file_name_format: str): + +# # Create a copy of `file_name_format` +# regex_pattern = file_name_format + +# # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern +# for key, value in LIVE_FILE_FORMAT_MAP.items(): +# regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) +# # ---- Replace the `FILE_ID` tag +# regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + +# # Compile the regex pattern and return the output +# return re.compile(regex_pattern) + +# from sqlalchemy.orm import sessionmaker + +# Session = sessionmaker(bind=engine) +# session = Session() +# session.close() +# engine.pool.status() +# # Dispose the engine to close all connections +# engine.dispose() +# import gc + +# gc.collect() +# import psutil + +# dbapi_conn = engine.raw_connection() +# dbapi_conn.close() +# # Get the process ID of the current process +# pid = psutil.Process().pid + +# # List all open files for the current process +# process = psutil.Process(pid) +# open_files = process.open_files() + +# for file in open_files: +# print(file.path) + + +# pattern = filename_format +# config_settings = sub_config_map +# regex_pattern = pattern + +# # Replace patterns based on LIVE_FILE_FORMAT_MAP +# for key, value in LIVE_FILE_FORMAT_MAP.items(): +# regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) +# regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) +# new_pattern = compile_filename_format(regex_pattern) +# match_obj = new_pattern.search(file.name) +# # Get substring components as a list +# filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) +# valid_tags = list(set(["HAUL", 
"SPECIES_CODE"]).intersection(set(filename_substrings))) + +# for i in valid_tags: +# matched_key = LIVE_FILE_FORMAT_MAP[i] +# df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + +# # Assign the data as new columns to the DataFrame +# for key, value in data_to_add.items(): +# df[key] = value + +# for i in valid_tags: +# matched_key = LIVE_FILE_FORMAT_MAP[i] +# df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) +# biology_analysis_settings +# species_id_value = 22500 +# trawl_partition_value = 'Codend' # Adjust as needed +# { +# key: df[ +# (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & +# (('trawl_partition' not in df.columns) or (df['trawl_partition'] == +# trawl_partition_value)) +# ] +# for key, df in biology_output.items() if isinstance(df, pd.DataFrame) +# } + +# (match_obj.group(i)).astype(matched_key["dtype"]) +# pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' +# modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) +# # Create the regex pattern +# regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +# re.compile(regex_pattern) + +# modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) + +# # Create the regex pattern +# regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +# compile_filename_format(regex_pattern) +# # Regular expression to capture values inside the curly braces +# regex = r'\{([^:}]+):([^}]+)\}' + +# # Find all matches +# matches = re.findall(regex, modified_pattern) + +# # Get substring components as a list +# filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + +# pattern_changed = pattern.replace("FILE_ID:", "") + +# # Compilte the filename regular expression format +# compiled_regex = compile_filename_format(pattern_changed) + +# file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] + +# # Get the file name and produce a `re.Match` object +# match_obj = compiled_regex.search(file.name) + + +# def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + +# # Get the file name and produce a `re.Match` object +# match_obj = pattern.search(file.name) + +# # Read in the `*.csv` file +# df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + +# # Validate the dataframe +# # ---- Check for any missing columns +# missing_columns = ( +# [key for key in config_settings["dtypes"].keys() if key not in df.columns] +# ) +# # ---- Raise Error, if needed +# if missing_columns: +# raise ValueError( +# f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
+# ) +# # ---- Ensure the correct datatypes +# df_validated = df.astype(config_settings["dtypes"]) + +# # Replace column names and drop +# df_validated = df_validated.rename(columns=config_settings["names"]) + +# # Get the haul number and add the the dataframe +# # ---- Extract the haul number and convert to an integer +# haul_num = int(match_obj.group("HAUL")) +# # ---- Add the column +# df_validated["haul_num"] = haul_num + +# # Return the resulting DataFrame +# return df_validated + +# boundary_dict = griddify_definitions["bounds"] + +# import geopandas as gpd +# import numpy as np +# import pandas as pd +# from geopy.distance import distance + +# from echopop.spatial.projection import utm_string_generator + +# ## +# grid_settings["grid_resolution"]["x"] = 50 +# grid_settings["grid_resolution"]["y"] = 50 +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters + +# # CREATE BOUNDING +# bound_df = pd.DataFrame({ +# "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), +# "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) +# }) + +# bound_gdf = gpd.GeoDataFrame( +# data=bound_df, +# geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), +# crs = projection +# ) +# import shapely.geometry + +# from echopop.spatial.projection import utm_string_generator + +# utm_string_generator(-117.0, 33.75) +# bound_gdf.total_bounds +# # Convert to UTM +# bound_utm = bound_gdf.to_crs(utm_num) +# bound_utm.total_bounds +# y_step = lat_step +# x_step = lon_step +# # bound_utm = bound_gdf +# # y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 +# # x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 + +# xmin, ymin, xmax, ymax = bound_utm.total_bounds + +# # Get number of cells +# n_x_cells = int(np.ceil((xmax - xmin) / x_step)) +# n_y_cells = int(np.ceil((ymax - ymin) / y_step)) + +# import pyproj + +# # create the cells in a loop +# # grid_cells = [] +# # for x0 in np.arange(xmin, xmax, x_step): +# # for y0 in np.arange(ymin, ymax, y_step): +# # # bounds +# # utm_zone = utm_string_generator(x0, y0) +# # proj = pyproj.Proj(f"epsg:{utm_code}") +# # x1 = x0-x_step +# # y1 = y0+y_step +# # grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# grid_cells = [] +# for y0 in np.arange(ymin, ymax, y_step): + +# # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) + +# for x0 in np.arange(xmin, xmax, x_step): +# # bounds +# # utm_zone = utm_string_generator(x0, y0) +# # proj = pyproj.Proj(f"epsg:{utm_code}") +# # x1, y1 = proj(x0, y0) +# # x2, y2 = proj(x0 - x_step, y0 + y_step) +# # grid_cells.append(box(x1, y1, x2, y2)) +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +# cells_gdf.shape +# n_x_cells * n_y_cells +# # cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) +# cells_gdf.total_bounds +# cells_gdf.to_crs(projection).total_bounds +# from shapely.geometry import mapping +# from shapely.validation import make_valid + +# ######## +# world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ +# ne_10m_land/ne_10m_land.shp") +# bb_orig = box(lon_min, lat_min, lon_max, lat_max) +# boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) +# world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) +# 
world_clipped_latlon = gpd.clip(world, boundary_box) +# world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) + +# world_utm = world.to_crs(utm_code) +# world_utm = world_utm[~world_utm.is_empty] + +# bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) + +# gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) + +# bbox_utm = bound_utm.total_bounds + +# buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] +# array_buffer = bbox_utm + buffer +# array_names = ["minx", "miny", "maxx", "maxy"] +# buffered = dict(zip(array_names, array_buffer)) +# buffer_boundary = box(**buffered) +# # box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) +# # buffer_boundary = buffer_boundary.to_crs(world_utm.crs) + +# buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) +# # Replace with the correct EPSG code +# bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) +# # sub_clipped = gpd.clip(world_utm, buffer_boundary) +# # sub_clipped = gpd.clip(world_utm, bbox_utm) + +# from datetime import datetime + +# import geopandas as gpd +# import matplotlib.cm as cm +# import matplotlib.colors as colors +# import matplotlib.dates as mdates +# import matplotlib.pyplot as plt +# import numpy as np +# from matplotlib.colors import ListedColormap +# from shapely import wkt + +# # fig, ax = plt.subplots(figsize=(10, 10)) +# # # Plot the buffer_boundary +# # world.plot(ax=ax, linewidth=2, color='gray') +# # buffer_boundary_gdf.to_crs(projection).plot(ax=ax, facecolor='none', edgecolor='blue') +# # bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') +# # plt.xlim(lon_min-3, lon_max+3) +# # plt.ylim(lat_min-3, lat_max+3) +# # plt.show() +# from echopop.live.sql_methods import SQL + +# db_filepath = realtime_survey.config["database"]["grid"] +# survey_db = realtime_survey.config["database"]["acoustics"] +# grid_df = SQL(db_filepath, "select", table_name="grid_df") +# # grid_df[grid_df.abundance > 0] +# grid_df[grid_df.abundance > 1e10] +# # grid_df[grid_df.abundance > 0] +# coast_df = SQL(db_filepath, "select", table_name="coastline_df") +# survey_df = SQL(survey_db, "select", table_name="survey_data_df") + +# # def parse_datetime(date_str): +# # # List of possible formats +# # formats = [ +# # '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds +# # '%Y-%m-%d %H:%M:%S', # Without fractional seconds +# # '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds +# # '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds +# # ] + +# # for fmt in formats: +# # try: +# # return pd.to_datetime(date_str, format=fmt) +# # except (ValueError, TypeError): +# # continue # Try the next format + +# # return pd.NaT # Return NaT if no formats match + +# # survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) + +# # pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") + +# # fig, ax = plt.subplots(figsize=(5, 8)) +# # ax.scatter(survey_df.ping_time, survey_df.nasc) +# # plt.ylabel("NASC") +# # # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) +# # plt.show() + + +# # times = np.arange(np.datetime64('2001-01-02'), +# # np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) +# # y = np.random.randn(len(times)) +# # survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() +# # survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() + +# # fig, ax = plt.subplots() +# # ax.plot(times, y) +# # 
survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] +# # # a = self.input["acoustics"]["prc_nasc_df"] +# # # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] + +# grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) +# coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) + +# projection = realtime_survey.config["geospatial"]["projection"] + +# grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) +# grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] +# coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) + +# lims = grid_gdf.total_bounds +# # nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] +# # dataset_gdf.stratum_x.max() +# # # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) + +# # # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) +# # pd.cut( +# # nu["x"], +# # np.arange(xmin, xmax, x_step), +# # right = False, +# # labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), +# # ).astype(int) - 1 +# # grid_gdf["x"] = grid_gdf["x"] - 1 + +# # fig, ax = plt.subplots(figsize=(5, 8)) +# # grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) +# # plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') +# # plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") +# # # Calculate centroids and plot text +# # for idx, row in grid_gdf.iterrows(): +# # centroid = row.geometry.centroid +# # var = f"{row.x}-{row.y}" +# # ax.annotate(var, xy=(centroid.x, centroid.y), +# # xytext=(0,0), fontsize=8, +# # textcoords="offset points", +# # ha='center', va='center', color='black') +# # plt.tight_layout() +# # plt.margins(0, 0) +# # coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # plt.xlim(lims[0]*1.005, lims[2]*1.01) +# # plt.ylim(lims[1]*0.98, lims[3]*1.005) +# # plt.show() + + +# variable = "abundance" +# VARIABLE_MAP = { +# "number_density_mean": { +# "name": "Mean number density", +# "units": "fish $\\mathregular{nmi^{-2}}$" +# }, +# "biomass_density_mean": { +# "name": "Mean biomass density", +# "units": "kg $\\mathregular{nmi^{-2}}$" +# }, +# "biomass": { +# "name": "Biomass", +# "units": "kg" +# }, +# "abundance": { +# "name": "Abundance", +# "units": "$\\it{N}$" +# } +# } + +# viridis = plt.colormaps.get_cmap('viridis').resampled(1024) +# newcolors = viridis(np.linspace(0, 1, 1024))[::-1] +# white = np.array([1, 1, 1, 1]) +# newcolors[0, :] = white +# custom_cmap = ListedColormap(newcolors) +# # Check the minimum and maximum values for normalization -# # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) -# pd.cut( -# nu["x"], -# np.arange(xmin, xmax, x_step), -# right = False, -# labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), -# ).astype(int) - 1 -# grid_gdf["x"] = grid_gdf["x"] - 1 # fig, ax = plt.subplots(figsize=(5, 8)) # grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) -# plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') -# plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") -# # Calculate centroids and plot text -# for idx, row in grid_gdf.iterrows(): -# centroid = row.geometry.centroid -# var = f"{row.x}-{row.y}" -# ax.annotate(var, xy=(centroid.x, centroid.y), -# xytext=(0,0), fontsize=8, -# textcoords="offset points", -# ha='center', va='center', color='black') +# grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, +# legend=False, norm=norm) 
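# NOTE: In the commented plotting block here, `norm` is passed to `grid_gdf_1.plot(...)`
# just above but is only defined a few lines below; if this block is revived, the
# normalization (and colormap) should be built first. A minimal, self-contained sketch
# using synthetic values only (the array below is a stand-in, not the real `grid_gdf` column):
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from matplotlib.colors import ListedColormap

values = np.array([0.0, 10.0, 250.0, 1000.0])  # e.g. gridded abundance per cell
# Reverse viridis and force the lowest entry to white so zero-valued cells plot blank
base = plt.colormaps.get_cmap("viridis").resampled(1024)
cmap_colors = base(np.linspace(0, 1, 1024))[::-1]
cmap_colors[0, :] = np.array([1.0, 1.0, 1.0, 1.0])
custom_cmap = ListedColormap(cmap_colors)
# Define the normalization before any plot call that references it
norm = colors.Normalize(vmin=0.0, vmax=values.max(), clip=False)
fig, ax = plt.subplots()
ax.scatter(np.arange(values.size), values, c=values, cmap=custom_cmap, norm=norm)
fig.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax,
             orientation="horizontal", shrink=0.5)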
+# plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") +# vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() +# vmax = grid_gdf[variable].max() +# norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) +# # norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), +# vmax=grid_gdf[variable].max()) +# # cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, +# orientation="horizontal", shrink=0.5) +# cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, +# orientation="horizontal", shrink=0.5) +# cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", +# fontsize=12, labelpad=10, loc='center') +# cbar.ax.xaxis.set_label_position('top') +# cbar.ax.xaxis.set_ticks_position('top') # plt.tight_layout() -# plt.margins(0, 0) +# plt.margins(0,0) +# # grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") # coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") # plt.xlim(lims[0]*1.005, lims[2]*1.01) # plt.ylim(lims[1]*0.98, lims[3]*1.005) +# plt.xlabel(u'Longitude (\u00B0E)') +# plt.ylabel(u'Latitude (\u00B0N)') # plt.show() -variable = "abundance" -VARIABLE_MAP = { - "number_density_mean": { - "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$" - }, - "biomass_density_mean": { - "name": "Mean biomass density", - "units": "kg $\\mathregular{nmi^{-2}}$" - }, - "biomass": { - "name": "Biomass", - "units": "kg" - }, - "abundance": { - "name": "Abundance", - "units": "$\\it{N}$" - } -} - -viridis = plt.colormaps.get_cmap('viridis').resampled(1024) -newcolors = viridis(np.linspace(0, 1, 1024))[::-1] -white = np.array([1, 1, 1, 1]) -newcolors[0, :] = white -custom_cmap = ListedColormap(newcolors) -# Check the minimum and maximum values for normalization - - -fig, ax = plt.subplots(figsize=(5, 8)) -grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) -grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, legend=False, norm=norm) -plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") -vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() -vmax = grid_gdf[variable].max() -norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) -# norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), vmax=grid_gdf[variable].max()) -# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, orientation="horizontal", shrink=0.5) -cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, orientation="horizontal", shrink=0.5) -cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", - fontsize=12, labelpad=10, loc='center') -cbar.ax.xaxis.set_label_position('top') -cbar.ax.xaxis.set_ticks_position('top') -plt.tight_layout() -plt.margins(0,0) -# grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") -coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -plt.xlim(lims[0]*1.005, lims[2]*1.01) -plt.ylim(lims[1]*0.98, lims[3]*1.005) -plt.xlabel(u'Longitude (\u00B0E)') -plt.ylabel(u'Latitude (\u00B0N)') -plt.show() - - -co = SQL(db_filepath, "select", table_name="coastline_df") -co["geometry"] = co["geometry"].apply(wkt.loads) -co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) - - - -test["geometry"].apply(wkt.loads) -clipped_cells_latlon["geometry"] -len(bbox_latlon.exterior.coords) -len(buffer_boundary.exterior.coords) - -# 
world_clipped_latlon = gpd.clip(world_utm, buffer_boundary).to_crs(projection) -world_clipped_latlon -######## -cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()).to_frame("geometry") -# cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()).to_frame("geometry") -cell_colors = cells_clipped.area / (lat_step * lon_step) -# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 -cells_clipped['cell_colors'] = cell_colors -# ---> back to epsg lat/long -cells_latlon = cells_clipped.to_crs(projection) -cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) -cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) -# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 -cells_latlon_clipped['cell_colors'] = cell_colors_clipped -######## -from shapely.geometry import Point, LineString, shape -nasc_df = survey.input["acoustics"]["nasc_df"] -nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], nasc_df["latitude"]), crs=projection) -geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())).to_frame("geometry").set_crs(projection) -custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' -cells_latlon_clipped.to_crs(custom_crs).crs -######## -import sqlalchemy as sqla -import matplotlib.colors as colors -import matplotlib.cm as cm -cells_transformed = cells_latlon.to_crs(utm_code) -lims = cells_transformed.total_bounds - -fig, ax = plt.subplots(figsize=(10, 10)) -# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) -# cells_clipped.plot.hexbin() -cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) -# cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), vmax=cells_latlon["cell_colors"].max()) -cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, orientation="horizontal", shrink=0.5) -cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, labelpad=10, loc='center') -cbar.ax.xaxis.set_label_position('top') -cbar.ax.xaxis.set_ticks_position('top') -geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") -# geo_df.reset_index().plot(ax=ax, color="red") -# plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") -ax.margins(0.00, 0.00) -world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -# world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -# bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') -plt.xlim(lims[0]*1.02, lims[2]*1.01) +# co = SQL(db_filepath, "select", table_name="coastline_df") +# co["geometry"] = co["geometry"].apply(wkt.loads) +# co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + + +# test["geometry"].apply(wkt.loads) +# clipped_cells_latlon["geometry"] +# len(bbox_latlon.exterior.coords) +# len(buffer_boundary.exterior.coords) + +# # world_clipped_latlon = gpd.clip(world_utm, 
buffer_boundary).to_crs(projection) +# world_clipped_latlon +# ######## +# cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()) +# .to_frame("geometry") +# # cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()) +# .to_frame("geometry") +# cell_colors = cells_clipped.area / (lat_step * lon_step) +# # cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +# cells_clipped['cell_colors'] = cell_colors +# # ---> back to epsg lat/long +# cells_latlon = cells_clipped.to_crs(projection) +# cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) +# cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) +# # cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +# cells_latlon_clipped['cell_colors'] = cell_colors_clipped +# ######## +# from shapely.geometry import LineString, Point, shape + +# nasc_df = survey.input["acoustics"]["nasc_df"] +# nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], +# nasc_df["latitude"]), crs=projection) +# geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())) +# .to_frame("geometry").set_crs(projection) +# custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +# +no_defs +type=crs' +# cells_latlon_clipped.to_crs(custom_crs).crs +# import matplotlib.cm as cm +# import matplotlib.colors as colors + +# ######## +# import sqlalchemy as sqla + +# cells_transformed = cells_latlon.to_crs(utm_code) +# lims = cells_transformed.total_bounds + +# fig, ax = plt.subplots(figsize=(10, 10)) +# # cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# # cells_clipped.plot.hexbin() +# cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", + +# legend=False) +# # cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# # cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", +# # legend=False) +# # cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# # cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), +# vmax=cells_latlon["cell_colors"].max()) +# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, +# orientation="horizontal", +# shrink=0.5) +# cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, labelpad=10, loc='center') +# cbar.ax.xaxis.set_label_position('top') +# cbar.ax.xaxis.set_ticks_position('top') +# geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") +# # geo_df.reset_index().plot(ax=ax, color="red") +# # plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") +# ax.margins(0.00, 0.00) +# world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') +# plt.xlim(lims[0]*1.02, lims[2]*1.01) +# # ax.set_yticks([4e6, 5e6, 6e6]) +# # ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +# plt.ylim(lims[1]*0.98, lims[3]*1.005) # ax.set_yticks([4e6, 5e6, 6e6]) # ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) -plt.ylim(lims[1]*0.98, lims[3]*1.005) -ax.set_yticks([4e6, 
5e6, 6e6]) -ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) -plt.xlabel("Eastings (km)") -plt.ylabel("Northings (km)") -# plt.xlabel("Longitude (°E)") -# ax.set_xticks([-135, -130, -125, -120]) -# plt.ylabel("Latitude (°N)") -ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) -ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) -# Adding the colorbar title -# cax = fig.get_axes()[1] # Assuming the colorbar is the second axis -# cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar -plt.tight_layout() -plt.show() \ No newline at end of file +# plt.xlabel("Eastings (km)") +# plt.ylabel("Northings (km)") +# # plt.xlabel("Longitude (°E)") +# # ax.set_xticks([-135, -130, -125, -120]) +# # plt.ylabel("Latitude (°N)") +# ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) +# ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) +# # Adding the colorbar title +# # cax = fig.get_axes()[1] # Assuming the colorbar is the second axis +# # cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar +# plt.tight_layout() +# plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 8e15088c..7c462db8 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,373 +1,393 @@ -from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import SQL -import echopop.live.live_visualizer as elv -from pathlib import Path -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -) -import boto3 -from botocore.exceptions import NoCredentialsError, ClientError -import pandas as pd -import numpy as np -from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary -from echopop.live.live_spatial_methods import apply_spatial_definitions -from echopop.live.live_acoustics import average_sigma_bs, compute_nasc -from echopop.live.live_biology import compute_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator -from functools import reduce -from echopop.live.live_data_loading import filter_filenames, read_biology_csv - -#################################################################################################### -# TEST: Set up `LiveSurvey` object -# NOTE: General initialization parameter configuration -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -# NOTE: File configuration -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" -# NOTE: Create object -realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -self = realtime_survey -file_configuration = self.config - -input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] -realtime_survey.config["input_directories"]["biology"]["directory"] = "s3://sh2407-upload/data/Echopop-biology" - -survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", 
table_name="survey_data_df") - - -del realtime_survey.config["data_root_dir"] -self = realtime_survey - -# realtime_survey.config["storage_options"] = aws_credentials -realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -realtime_survey.load_biology_data(input_filenames=input_filenames) -realtime_survey.input["biology"] -def is_s3_path(path): - """Check if a path is an S3 path.""" - return path.startswith("s3://") - -dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] -s3_path = dataset_directory -is_s3_path(dataset_directory) - -cloud_credentials = aws_credentials -cloud_credentials = {} -def validate_s3_path(s3_path: str, cloud_credentials: dict): - """Check if (parts of) S3 path exists.""" - - # Redundant validation that S3 object validation is appropriate - if not is_s3_path(s3_path): - raise ValueError("The path is not an S3 path.") - - # Validate credentials - if not all([True if param in cloud_credentials.keys() else False - for param in ["key", "secret"]]): - # ---- Find missing credentials - missing_creds = set(["key", "secret"]) - set(cloud_credentials) - # ---- Format into string - missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) - # ---- Raise Error - raise PermissionError( - f"Required S3 credentials missing: {missing_creds_str}." - ) - - # Remove the s3:// prefix - s3_path_reduced = s3_path[len("s3://"):] - - # Split into bucket and key - parts = s3_path_reduced.split("/", 1) - if len(parts) < 2: - raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - - # Get bucket name and directory keys - bucket_name, directory = parts - - # Initialize the S3 client - s3_client = boto3.client("s3", - aws_access_key_id=cloud_credentials["key"], - aws_secret_access_key=cloud_credentials["secret"]) - - # Check if the bucket exists - try: - s3_client.head_bucket(Bucket=bucket_name) - except ClientError as e: - raise FileNotFoundError( - f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
- ) - - # Check if the S3 directory exists - try: - # ---- Ping a response from the bucket - response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) - # ---- Check for `Contents` - if "Contents" not in response: - raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") - except ClientError as e: - # --- Raise Error and propagate it upwards - raise e - -validate_s3_path(s3_path, cloud_credentials) - -import pandas as pd - -self = realtime_survey -biology_files = self.meta["provenance"]["biology_files_read"] -file_configuration = self.config -dataset = "biology" - -# Get the dataset file settings -file_settings = file_configuration["input_directories"][dataset] - -def construct_directorypath(file_configuration: dict, file_settings: dict): - """Construct the root directory path.""" - - # Get the general root_directory, if present - if "data_root_dir" in file_configuration: - root_directory = file_configuration["data_root_dir"] - else: - root_directory = "" - - # Get the local directory (or this may be the root directory depending on the config) - data_directory = file_settings["directory"] - - # Return the directory path - if root_directory != "": - return "/".join([root_directory, data_directory]) - else: - return data_directory - -directory_path = construct_directorypath(file_configuration, file_settings) - -def validate_local_path(directory_path: str): - - # Validate filepath - # ---- Error evaluation (if applicable) - if not Path(directory_path).exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available files of target extension - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - ) - - - - -# Get the biology data file settings -file_settings = file_configuration["input_directories"]["biology"] - -# Get the file-specific settings, datatypes, columns, etc. 
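# NOTE: The surrounding scratch code mixes the s3fs/pandas "storage_options" convention
# ("key"/"secret") with boto3's keyword argument names; a minimal sketch of that mapping
# and of a cheap prefix-existence probe. The bucket name, prefix, and credentials below
# are placeholders:
import boto3

storage_options = {"key": "<access-key-id>", "secret": "<secret-access-key>"}  # placeholders
s3_client = boto3.client(
    "s3",
    aws_access_key_id=storage_options["key"],
    aws_secret_access_key=storage_options["secret"],
)
# A single keyed object under the prefix is enough to confirm the S3 "directory" exists
response = s3_client.list_objects_v2(Bucket="example-bucket", Prefix="data/", MaxKeys=1)
prefix_exists = "Contents" in response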
-# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# ---- Extract the expected file name ID's -biology_file_ids = file_settings["file_name_formats"] -# ---- Extract all of the file ids -biology_config_ids = list(biology_file_ids.keys()) -# ---- Initialize the dictionary that will define this key in the `input` attribute -biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - - -# Initialize a session with AWS credentials -s3_client = boto3.client( - 's3', - aws_access_key_id=aws_credentials["key"], - aws_secret_access_key=aws_credentials["secret"] -) -response = s3_client.list_buckets() -buckets = response.get('Buckets', []) -for bucket in buckets: - print(f"Bucket Name: {bucket['Name']}") -s3_client.head_bucket(Bucket="sh2407-upload") -realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) -realtime_survey.config["ship_id"] -grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") -grid_data[grid_data.abundance > 0] -bucket = boto3.client("s3", region_name=None) -bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] + "/") -bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], Prefix=path, MaxKeys=1) -#################################################################################################### -# TEST: TRIGGER --> NEW ACOUSTIC DATA -# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) -realtime_survey.load_acoustic_data() -# NOTE: Process new acoustic data -# NOTE: This will update linked database tables -realtime_survey.process_acoustic_data() -# NOTE: Generate population estimates (or pass if there are no biological data) -# NOTE: `working_dataset = Literal["acoustic", "biology"]` -realtime_survey.estimate_population(working_dataset="acoustic") -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey.input["acoustics"] -#################################################################################################### -# TEST: TRIGGER --> NEW BIOLOGY DATA -# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) -realtime_survey.load_biology_data() -len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) -realtime_survey.meta["provenance"]["biology_files_checkpoint3"] -# NOTE: Process new biological data -# NOTE: This will update linked database tables -realtime_survey.process_biology_data() -# NOTE: Generate population estimates (or pass if there are no acoustic data) -# NOTE: `working_dataset = Literal["acoustic", "biology"]` -realtime_survey.estimate_population(working_dataset="biology") -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey -#################################################################################################### -# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` -# NOTE: `LiveSurvey.meta` attribute -# ---- ACOUSTIC -realtime_survey.meta["provenance"]["acoustic_files"] -# ---- BIOLOGICAL -realtime_survey.meta["provenance"]["biology_files"] -# NOTE: SQL function query from database file [cumulative list] -# ---- ACOUSTIC -SQL(db_file=realtime_survey.config["database"]["acoustics"], - 
command="select", table_name="files_processed") -dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", table_name="files_processed") -# ---- BIOLOGICAL -SQL(db_file=realtime_survey.config["database"]["biology"],command="select", table_name="files_processed") -dat.loc[0:, "filepath"][105] -#################################################################################################### -# TEST: `LiveSurvey` --[(key) SQL tables]--> Users -# !!! The SQL functions will fail if the tables have not yet been created/initialized -# ---- ACOUSTICS -# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df").latitude.max() -realtime_survey.input["spatial"]["strata"] -# NOTE: Along-track acoustically-derived number/biomass densities and NASC -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# ---- BIOLOGICAL -# NOTE: Fitted (discretized) length-weight relationship -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -# NOTE: Quantized length-binned weights (summed) -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -# NOTE: Average weights per stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -# NOTE: Stratum summary tables -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -#################################################################################################### -# FROM THE `LiveSurvey` object ! 
-# ---- Convert to a Panel -import panel as pn -# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table -survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -# grid_db = Path(realtime_survey.config["database"]["grid"]) -grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") -dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -dat -dat1 = SQL(grid_db, "select", table_name="grid_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") - -sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" -# Create the engine -engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") -# Create the SQL database connection and send the script -with engine.connect() as connection: - table = connection.execute(text(sql_cmd)) - -data = table.fetchall() -dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] -dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] -dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) -SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", dataframe=dd) -SQL("C:/Users/Brandyn/Downloads/biology.db", "map") -SQL(biology_db, "drop", table_name="sigma_bs_mean_df") -SQL(biology_db, "select", table_name="sigma_bs_mean_df") -dd.loc[:, "haul_num"] = pd.Series([101, 103]) -dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] -SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) -SQL(biology_db, "select", table_name="sigma_bs_mean_df") -import numpy as np; import pandas as pd -SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") -sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="sigma_bs_mean_df") -table_df = SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -sigma_bs_df = table_df -# ---- Check the table keys -table_keys = np.unique(table_df["id"]).tolist() -# ---- Get unique values -current_keys = np.unique(sigma_bs_df["id"]).tolist() -# ---- Get INSERTION keys -insertion_keys = list(set(current_keys).difference(set(table_keys))) -# ---- Get UPDATE keys -update_keys = list(set(current_keys).intersection(set(table_keys))) -insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] -insertion_df.loc[0, "species_id"] = 22500 -insertion_df.loc[0, "stratum"] = 5 -insertion_df.loc[0, "haul_num"] = 100 -insertion_df.loc[0, "sigma_bs"] = 1e-10 -insertion_df.loc[0, "sigma_bs_count"] = 100 -insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 -insertion_df.loc[0, "id"] = f"{(1,1,1)}" -SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", - dataframe=insertion_df) -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -dat1[dat1.abundance > 0] -dat[dat.number_density > 0] -coast_db = grid_db -biology_db = Path(realtime_survey.config["database"]["biology"]) -projection = realtime_survey.config["geospatial"]["projection"] -# NOTE: PLOTS -# Ensure Panel is initialized -pn.extension() -# ---- Helper function -def plt_to_pn(fig): - # Convert to a panel object - panel = pn.panel(fig) - # Display - panel.show() # OR panel.servable() if you want to serve it 
in a Panel server -# ---- PLOT GRID -fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) -fig.show() -plt_to_pn(fig) -# ---- PLOT TRACK -from echopop.live.live_visualizer import plot_livesurvey_track -fig1 = plot_livesurvey_track(survey_data, projection, coast_db) -fig1.show() -plt_to_pn(fig1) -# ---- PLOT DISTRIBUTIONS -weight_table = SQL(biology_db, "select", - table_name="length_weight_df") -stratum_table = SQL(biology_db, "select", - table_name="strata_summary_df") -specimen_table = SQL(biology_db, "select", - table_name="specimen_data_df") -length_table = SQL(biology_db, "select", - table_name="length_df") -fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) -plt_to_pn(fig2) -### MULTIPANEL -panel0 = pn.panel(fig, name='Gridded population estimates') -panel1 = pn.panel(fig1, name='Alongtrack population estimates') -panel2 = pn.panel(fig2, name='Length and weight distributions') - -def serve_panels(): - # Create links to each panel - home = pn.Column( - pn.pane.Markdown("# Main Page"), - pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", sizing_mode="stretch_width"), - pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", sizing_mode="stretch_width"), - pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", sizing_mode="stretch_width") - ) - - # Serve the home page and individual panels - pn.serve({ - 'Main Page': home, - 'gridded_population_estimates': panel0, - 'alongtrack_population_estimates': panel1, - 'length_weight_distributions': panel2 - }, show=True) -# Run the function to serve panels -serve_panels() \ No newline at end of file +# from echopop.live.live_survey import LiveSurvey +# from echopop.live.sql_methods import SQL +# import echopop.live.live_visualizer as elv +# from pathlib import Path +# from echopop.live import live_data_processing as eldp +# from echopop.live import live_data_loading as eldl +# from echopop.live.live_core import( +# LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +# ) +# import boto3 +# from botocore.exceptions import NoCredentialsError, ClientError +# import pandas as pd +# import numpy as np +# from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +# sql_group_update, query_processed_files, sql_update_strata_summary +# from echopop.live.live_spatial_methods import apply_spatial_definitions +# from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +# from echopop.live.live_biology import compute_sigma_bs +# from echopop.acoustics import ts_length_regression, to_dB, to_linear +# from echopop.utils.operations import group_interpolator_creator +# from functools import reduce +# from echopop.live.live_data_loading import filter_filenames, read_biology_csv + +# ################################################################################################## +# # TEST: Set up `LiveSurvey` object +# # NOTE: General initialization parameter configuration +# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initializat +# ion_config.yml" +# # NOTE: File configuration +# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_yea +# r_2019_config.yml" +# # NOTE: Create object +# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +# realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) + +# # NOTE: String-representation via 
`LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# self = realtime_survey +# file_configuration = self.config + +# input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", +# "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] +# realtime_survey.config["input_directories"]["biology"]["directory"] = +# "s3://sh2407-upload/data/Echopop-biology" + +# survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", +# table_name="survey_data_df") + + +# del realtime_survey.config["data_root_dir"] +# self = realtime_survey + +# # realtime_survey.config["storage_options"] = aws_credentials +# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +# realtime_survey.load_biology_data(input_filenames=input_filenames) +# realtime_survey.input["biology"] +# def is_s3_path(path): +# """Check if a path is an S3 path.""" +# return path.startswith("s3://") + +# dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] +# s3_path = dataset_directory +# is_s3_path(dataset_directory) + +# cloud_credentials = aws_credentials +# cloud_credentials = {} +# def validate_s3_path(s3_path: str, cloud_credentials: dict): +# """Check if (parts of) S3 path exists.""" + +# # Redundant validation that S3 object validation is appropriate +# if not is_s3_path(s3_path): +# raise ValueError("The path is not an S3 path.") + +# # Validate credentials +# if not all([True if param in cloud_credentials.keys() else False +# for param in ["key", "secret"]]): +# # ---- Find missing credentials +# missing_creds = set(["key", "secret"]) - set(cloud_credentials) +# # ---- Format into string +# missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in +# missing_creds]) +# # ---- Raise Error +# raise PermissionError( +# f"Required S3 credentials missing: {missing_creds_str}." +# ) + +# # Remove the s3:// prefix +# s3_path_reduced = s3_path[len("s3://"):] + +# # Split into bucket and key +# parts = s3_path_reduced.split("/", 1) +# if len(parts) < 2: +# raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + +# # Get bucket name and directory keys +# bucket_name, directory = parts + +# # Initialize the S3 client +# s3_client = boto3.client("s3", +# aws_access_key_id=cloud_credentials["key"], +# aws_secret_access_key=cloud_credentials["secret"]) + +# # Check if the bucket exists +# try: +# s3_client.head_bucket(Bucket=bucket_name) +# except ClientError as e: +# raise FileNotFoundError( +# f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
+# ) + +# # Check if the S3 directory exists +# try: +# # ---- Ping a response from the bucket +# response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) +# # ---- Check for `Contents` +# if "Contents" not in response: +# raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") +# except ClientError as e: +# # --- Raise Error and propagate it upwards +# raise e + +# validate_s3_path(s3_path, cloud_credentials) + +# import pandas as pd + +# self = realtime_survey +# biology_files = self.meta["provenance"]["biology_files_read"] +# file_configuration = self.config +# dataset = "biology" + +# # Get the dataset file settings +# file_settings = file_configuration["input_directories"][dataset] + +# def construct_directorypath(file_configuration: dict, file_settings: dict): +# """Construct the root directory path.""" + +# # Get the general root_directory, if present +# if "data_root_dir" in file_configuration: +# root_directory = file_configuration["data_root_dir"] +# else: +# root_directory = "" + +# # Get the local directory (or this may be the root directory depending on the config) +# data_directory = file_settings["directory"] + +# # Return the directory path +# if root_directory != "": +# return "/".join([root_directory, data_directory]) +# else: +# return data_directory + +# directory_path = construct_directorypath(file_configuration, file_settings) + +# def validate_local_path(directory_path: str): + +# # Validate filepath +# # ---- Error evaluation (if applicable) +# if not Path(directory_path).exists(): +# raise FileNotFoundError( +# f"The acoustic data directory [{directory_path}] does not exist." +# ) + +# # Validate that files even exist +# # ---- List available files of target extension +# data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) +# # ---- Error evaluation (if applicable) +# if not data_files: +# raise FileNotFoundError( +# f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" +# ) + + +# # Get the biology data file settings +# file_settings = file_configuration["input_directories"]["biology"] + +# # Get the file-specific settings, datatypes, columns, etc. 
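# NOTE: The commented `validate_local_path` above calls `.glob` on a plain string (only the
# existence check wraps it in `Path`), so it would raise an AttributeError if revived. A
# minimal corrected sketch; passing the extension explicitly, instead of reading the
# enclosing-scope `file_settings`, is an editorial assumption:
from pathlib import Path


def validate_local_path(directory_path: str, extension: str) -> None:
    """Check that a data directory exists and contains at least one matching file."""
    directory = Path(directory_path)
    # ---- Error evaluation (if applicable)
    if not directory.exists():
        raise FileNotFoundError(f"The data directory [{directory}] does not exist.")
    # ---- List available files of the target extension
    data_files = list(directory.glob(f"*.{extension}"))
    # ---- Error evaluation (if applicable)
    if not data_files:
        raise FileNotFoundError(f"No `*.{extension}` files found in [{directory}]!")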
+# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + +# # Initialize a session with AWS credentials +# s3_client = boto3.client( +# 's3', +# aws_access_key_id=aws_credentials["key"], +# aws_secret_access_key=aws_credentials["secret"] +# ) +# response = s3_client.list_buckets() +# buckets = response.get('Buckets', []) +# for bucket in buckets: +# print(f"Bucket Name: {bucket['Name']}") +# s3_client.head_bucket(Bucket="sh2407-upload") +# realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) +# realtime_survey.config["ship_id"] +# grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") +# grid_data[grid_data.abundance > 0] +# bucket = boto3.client("s3", region_name=None) +# bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] +# +"/") +# bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], +# Prefix=path, MaxKeys=1) +# ################################################################################################# +# # TEST: TRIGGER --> NEW ACOUSTIC DATA +# # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) +# realtime_survey.load_acoustic_data() +# # NOTE: Process new acoustic data +# # NOTE: This will update linked database tables +# realtime_survey.process_acoustic_data() +# # NOTE: Generate population estimates (or pass if there are no biological data) +# # NOTE: `working_dataset = Literal["acoustic", "biology"]` +# realtime_survey.estimate_population(working_dataset="acoustic") +# # NOTE: String-representation via `LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# realtime_survey.input["acoustics"] +# ################################################################################################## +# # TEST: TRIGGER --> NEW BIOLOGY DATA +# # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` +# realtime_survey.load_biology_data() +# len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) +# realtime_survey.meta["provenance"]["biology_files_checkpoint3"] +# # NOTE: Process new biological data +# # NOTE: This will update linked database tables +# realtime_survey.process_biology_data() +# # NOTE: Generate population estimates (or pass if there are no acoustic data) +# # NOTE: `working_dataset = Literal["acoustic", "biology"]` +# realtime_survey.estimate_population(working_dataset="biology") +# # NOTE: String-representation via `LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# realtime_survey +# ################################################################################################## +# # TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# # NOTE: `LiveSurvey.meta` attribute +# # ---- ACOUSTIC +# realtime_survey.meta["provenance"]["acoustic_files"] +# # ---- BIOLOGICAL +# realtime_survey.meta["provenance"]["biology_files"] +# # NOTE: SQL function query from 
database file [cumulative list] +# # ---- ACOUSTIC +# SQL(db_file=realtime_survey.config["database"]["acoustics"], +# command="select", table_name="files_processed") +# dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", +# table_name="files_processed") +# # ---- BIOLOGICAL +# SQL(db_file=realtime_survey.config["database"]["biology"],command="select", +# table_name="files_processed") +# dat.loc[0:, "filepath"][105] +# ################################################################################################## +# # TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# # !!! The SQL functions will fail if the tables have not yet been created/initialized +# # ---- ACOUSTICS +# # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") +# .latitude.max() +# realtime_survey.input["spatial"]["strata"] +# # NOTE: Along-track acoustically-derived number/biomass densities and NASC +# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# # ---- BIOLOGICAL +# # NOTE: Fitted (discretized) length-weight relationship +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +# # NOTE: Quantized length-binned weights (summed) +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# # NOTE: Average weights per stratum +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# # NOTE: Stratum summary tables +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +# ################################################################################################## +# # FROM THE `LiveSurvey` object ! 
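# NOTE: In the Panel/SQL scratch block that follows, the `create_engine` call nests double
# quotes inside a double-quoted f-string, which only parses on Python >= 3.12 (the colorbar
# label f-string earlier has the same pattern). A version-agnostic sketch of that query,
# with a placeholder database path:
import pandas as pd
from sqlalchemy import create_engine, text

db_file = "path/to/biology.db"  # placeholder path
engine = create_engine(f"sqlite:///{db_file}")
sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id"
with engine.connect() as connection:
    table = connection.execute(text(sql_cmd))
    data = pd.DataFrame(table.fetchall(), columns=table.keys())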
+# # ---- Convert to a Panel +# import panel as pn +# # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +# survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) +# # grid_db = Path(realtime_survey.config["database"]["grid"]) +# grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") +# dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# dat +# dat1 = SQL(grid_db, "select", table_name="grid_df") +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") + +# sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" +# # Create the engine +# engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# table = connection.execute(text(sql_cmd)) + +# data = table.fetchall() +# dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] +# dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] +# dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) +# SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", +# dataframe=dd) +# SQL("C:/Users/Brandyn/Downloads/biology.db", "map") +# SQL(biology_db, "drop", table_name="sigma_bs_mean_df") +# SQL(biology_db, "select", table_name="sigma_bs_mean_df") +# dd.loc[:, "haul_num"] = pd.Series([101, 103]) +# dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] +# SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) +# SQL(biology_db, "select", table_name="sigma_bs_mean_df") +# import numpy as np; import pandas as pd +# SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") +# sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", +# table_name="sigma_bs_mean_df") +# table_df = SQL(realtime_survey.config["database"]["biology"], "select", +# table_name="sigma_bs_mean_df") +# sigma_bs_df = table_df +# # ---- Check the table keys +# table_keys = np.unique(table_df["id"]).tolist() +# # ---- Get unique values +# current_keys = np.unique(sigma_bs_df["id"]).tolist() +# # ---- Get INSERTION keys +# insertion_keys = list(set(current_keys).difference(set(table_keys))) +# # ---- Get UPDATE keys +# update_keys = list(set(current_keys).intersection(set(table_keys))) +# insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] +# insertion_df.loc[0, "species_id"] = 22500 +# insertion_df.loc[0, "stratum"] = 5 +# insertion_df.loc[0, "haul_num"] = 100 +# insertion_df.loc[0, "sigma_bs"] = 1e-10 +# insertion_df.loc[0, "sigma_bs_count"] = 100 +# insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 +# insertion_df.loc[0, "id"] = f"{(1,1,1)}" +# SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", +# dataframe=insertion_df) +# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +# survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", +# table_name="survey_data_df") +# dat1[dat1.abundance > 0] +# dat[dat.number_density > 0] +# coast_db = grid_db +# biology_db = Path(realtime_survey.config["database"]["biology"]) +# projection = realtime_survey.config["geospatial"]["projection"] +# # NOTE: PLOTS +# # Ensure Panel is initialized +# pn.extension() +# # ---- Helper function +# def 
plt_to_pn(fig): +# # Convert to a panel object +# panel = pn.panel(fig) +# # Display +# panel.show() # OR panel.servable() if you want to serve it in a Panel server +# # ---- PLOT GRID +# fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +# fig.show() +# plt_to_pn(fig) +# # ---- PLOT TRACK +# from echopop.live.live_visualizer import plot_livesurvey_track +# fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +# fig1.show() +# plt_to_pn(fig1) +# # ---- PLOT DISTRIBUTIONS +# weight_table = SQL(biology_db, "select", +# table_name="length_weight_df") +# stratum_table = SQL(biology_db, "select", +# table_name="strata_summary_df") +# specimen_table = SQL(biology_db, "select", +# table_name="specimen_data_df") +# length_table = SQL(biology_db, "select", +# table_name="length_df") +# fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, +# length_table) +# plt_to_pn(fig2) +# ### MULTIPANEL +# panel0 = pn.panel(fig, name='Gridded population estimates') +# panel1 = pn.panel(fig1, name='Alongtrack population estimates') +# panel2 = pn.panel(fig2, name='Length and weight distributions') + +# def serve_panels(): +# # Create links to each panel +# home = pn.Column( +# pn.pane.Markdown("# Main Page"), +# pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", +# sizing_mode="stretch_width"), +# pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", +# sizing_mode="stretch_width"), +# pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", +# sizing_mode="stretch_width") +# ) + +# # Serve the home page and individual panels +# pn.serve({ +# 'Main Page': home, +# 'gridded_population_estimates': panel0, +# 'alongtrack_population_estimates': panel1, +# 'length_weight_distributions': panel2 +# }, show=True) +# # Run the function to serve panels +# serve_panels() diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index 5db0e84c..2ac1b77e 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -306,10 +306,12 @@ def group_merge(dataframe, dataframes_to_add, inner_on, outer_on, how="outer", d def group_interpolator_creator( - grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, - contrast: Union[List[str], str] + grouped_data: pd.DataFrame, + independent_var: str, + dependent_var: str, + contrast: Union[List[str], str], ) -> dict: - + # Check if `contrast` is a list or not if not isinstance(contrast, list): contrast = [] @@ -328,9 +330,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( - grouped_data.groupby(contrast).apply( - lambda group: interpolator_factory(group) - ) + grouped_data.groupby(contrast).apply(lambda group: interpolator_factory(group)) ).to_dict() # Return output diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 101bc81a..f9385b6b 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,1885 +1,1948 @@ -import xarray as xr -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from typing import Union, Tuple, Optional -from pathlib import Path -import copy -import yaml -import glob -from datetime import datetime -import geopandas as gpd -import os -import re -import contextlib -from echopop.acoustics import ts_length_regression, to_linear, to_dB -from sqlalchemy import create_engine, text, Engine, inspect -from echopop.live.live_core 
import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP -from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange, initialize_database, sql_update_strata_summary -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_data_processing import query_dataset, get_unique_identifiers -from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency -from echopop.live.live_biology import preprocess_biology_data -from echopop.survey import Survey -import geopandas as gpd -import pandas as pd -import numpy as np -import shapely.geometry -from shapely.geometry import box -from echopop.spatial.projection import utm_string_generator -from geopy.distance import distance -from echopop.live.sql_methods import SQL -from shapely import wkt -import matplotlib.pyplot as plt -import geopandas as gpd -import matplotlib.colors as colors -import matplotlib.cm as cm -import numpy as np -from matplotlib.colors import ListedColormap -self = realtime_survey -spatial_config = self.config["geospatial"] -dataset = self.input["acoustics"]["nasc_df"] - - - - -survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") -survey_2019.transect_analysis() -survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] -analysis_dict = survey_2019.analysis["transect"] -SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") -proportions_dict=analysis_dict["biology"]["proportions"]["number"] -length_weight_dict = analysis_dict["biology"]["weight"] -stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] - -updated_survey_data = nasc_biology.copy() -gridding_column = file_configuration["gridding_column"] - -unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) - - -file_configuration = self.config -grid_settings["grid_resolution"]["x"] = 50 -grid_settings["grid_resolution"]["y"] = 50 -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters -self = realtime_survey -file_configuration = self.config - -def initialize_grid(): - - # Get root directory, if defined - if "data_root_dir" in file_configuration: - root_dir = Path(file_configuration["data_root_dir"]) - else: - root_dir = Path() - - # Get `grid` settings - grid_database = file_configuration["input_directories"]["grid"]["database_name"] - - # Create full filepath - db_filepath = root_dir / "database" / grid_database - - # Create if file doesn't already exist - if not db_filepath.exists(): - - # Get projection - projection = file_configuration["geospatial"]["projection"] - - # Get grid settings - grid_settings = file_configuration["geospatial"]["griddify"] - - # Get the resolution - resolution = grid_settings["grid_resolution"] - # ---- Convert from nmi to m - resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} - - # Get boundary coordinates - boundary = grid_settings["bounds"] - # ---- x - x = boundary["longitude"] - # ---- y - y = boundary["latitude"] - # ---- Create DataFrame - boundary_df = pd.DataFrame({ - "x": 
np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), - "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) - }) - - # Create GeoDataFrame - boundary_gdf = gpd.GeoDataFrame( - data = boundary_df, - geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), - crs = projection - ) - - # Convert to UTM (decimal degrees to m) - # ---- Create UTM code - utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, - (boundary_df.y.min() + boundary_df.y.max()) / 2) - # ---- Create number code - utm_num = int(utm_code) - # ---- Create string code - utm_str = f"epsg:{utm_num}" - # ---- UTM conversion - boundary_gdf_utm = boundary_gdf.to_crs(utm_num) - - # Get step sizes for each grid cell - # ---- x - x_step = resolution_m["x_distance"] - # ---- y - y_step = resolution_m["y_distance"] - - # Prepare grid cell generation - # ---- Get new boundaries - xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds - # ---- Initialize empty list - grid_cells = [] - # ---- Initialize coordinate counter - y_ct = 0 - x_coord = []; y_coord = [] - # ---- Iterate through to generate cells - for y0 in np.arange(ymin, ymax, y_step): - y_ct += 1 - x_ct = 0 - for x0 in np.arange(xmin, xmax, x_step): - x_ct += 1 - # ---- Step forward - x_coord.append(x_ct) - y_coord.append(y_ct) - x1 = x0 - x_step - y1 = y0 + y_step - # ---- Append to list - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - - # Convert to a GeoDataFrame - cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) - # ---- Add cordinates - cells_gdf.loc[:, "x"] = np.array(x_coord) - cells_gdf.loc[:, "y"] = np.array(y_coord) - - # Get coastline shapefile directory, if defined - if "coastline" in file_configuration["input_directories"]: - - # Get coastline settings - coast_settings = file_configuration["input_directories"]["coastline"] - # ---- Create filepath - shp_filepath = ( - root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" - ) - # ---- Validate existence - if not shp_filepath.exists(): - raise FileNotFoundError( - f"{shp_filepath} does not exist!" 
- ) - - # Get original lat/lon geometry boundaries - xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds - - # Read in file - full_coast = gpd.read_file(shp_filepath) - # ---- Convert to UTM - full_coast_utm = full_coast.to_crs(utm_code) - # ---- Remove empty - full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] - - # Create bouning box with a buffer - boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) - # ---- Create an unbuffered copy - boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) - # ---- Convert to a GeoDataFrame - boundary_box_unbuffered_gdf = ( - gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) - ) - # ---- Clip the coastline for saving - clipped_coast_original = ( - gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) - ) - - # Clip the coastline shapefile - clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) - - # Clip the grid cells - cells_gdf.loc[:, "geometry"] = ( - cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) - ) - - # Calculate area per cell - cells_gdf.loc[:, "area"] = cells_gdf.area - - # Convert back to original projection and clip - clipped_cells_latlon = ( - gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) - .reset_index(drop=True) - ) - - # Initialize empty columns that can be added to later on - clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", - "abundance", "biomass"]] = 0.0 - - # Create output DataFrame - output_df = pd.DataFrame({ - "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) - }) - # ---- Add the required columns - output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], - axis=1) - # ---- Initialize empty columns that can be added to later on - output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", - "biomass"]] = 0.0 - - # Write to the database file (for the grid) - # ---- Create engine - engine = sqla.create_engine(f"sqlite:///{db_filepath}") - # ---- Connect and create table - _ = output_df.to_sql("grid_df", engine, if_exists="replace") - - # Write to the database file (for the coastline shapefile) - # ---- Create output copy - coastline_out = pd.DataFrame({ - "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) - }) - # ---- Concatenate - coastline_out = ( - pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) - ) - # ---- Connect and create table - _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") - -#################################################################################################### -# TEST: YAML FILE CONFIGURATION -# ---- Define filepaths -self = LiveSurvey -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" -# ---- Run function: `live_configuration` -file_configuration = self.config -files = biology_files - -biology_output = initial_biology_output -file_configuration = self.config -table_name = "length_df" -df = filtered_biology_output[table_name] -database_file = biology_db -kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - -# NOTE: ARGUMENT: {working_dataset: Literal["acoustics", "biology"]} -working_dataset = "acoustics" -self = realtime_survey -file_configuration = self.config 
-self.results["biology"] = self.input["biology_processed"] -self.results["acoustics"] = self.input["nasc_df"] - -# Get spatial column -spatial_column = file_configuration["spatial_column"] - -# Initialize the working data dictionary -working_data = copy.deepcopy(self.results) -contrast_columns = [] -# ---- Define unique columns -unique_columns = spatial_column + contrast_columns - -acoustic_db = file_configuration["database"][working_dataset] -self = realtime_survey -acoustic_dict = self.input["acoustics"] -verbose = True -contrast_columns = [] -db_file = acoustic_db -table_name="survey_data_df" -data_columns = data_columns -unique_columns=unique_columns -constraint="nasc > 0.0" -data_dict = self.input["acoustics"] -data_dict["nasc_df"]["stratum"] = 1 -data_dict["prc_nasc_df"]["stratum"] = 2 -table_name = "sigma_bs_mean_df" -data_columns=["sigma_bs", "sigma_bs_count"] -biology_db -strata_df = self.input["spatial"]["strata"] - -def biology_pipeline(biology_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): - - # Get spatial column - spatial_column = file_configuration["spatial_column"] - unique_columns = spatial_column + contrast_columns - - # Get database file - acoustic_db = file_configuration["database"]["acoustics"] - - # Get biology database file - biology_db = file_configuration["database"]["biology"] - - # Check for data completion - # ---- List of boolean values - full_biology_data = ( - [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is not None] - ) - # ---- Validation - if not all(full_biology_data): - # ---- Print, if verbose - if verbose: - print( - f"No new processed biology data available for processing." - ) - else: - # Get related biology data - acoustic_df = get_nasc_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) - - # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) - - # Calculate population estimates if valid data are available - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): - # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) - # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) - ) - - # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) - - if weight_spatial_averages is not None: - # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) - - # Compute biomass densities - nasc_biology["biomass_density"] = ( - nasc_biology["number_density"] * nasc_biology["average_weight"] - ) - - # Update the survey population estimate DataFrame with the newly computed densities - if not nasc_biology.empty: - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) - - # Summarize strata - summarize_strata(nasc_biology, strata_df, file_configuration) - -db_file=acoustic_db -dataframe=nasc_biology -table_name="survey_data_df" -columns=["number_density", "biomass_density"] 
-unique_columns=["stratum", "longitude", "latitude", "ping_time"] -nasc_biology["number_density"].sum() / 2 -nasc_biology["number_density"] -SQL(acoustic_db, "select", table_name="survey_data_df") -SQL(biology_db, "select", table_name="strata_summary_df") -strata_df = self.input["spatial"]["strata"].copy() -strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", - "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan -strata_df.drop(columns=["latitude_interval"], inplace=True) -SQL(acoustic_db, "select", table_name="survey_data_df") - -SQL(biology_db, "drop", table_name="strata_summary_df") -SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, primary_keys=["stratum"]) -SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, - id_columns=["stratum"]) - -tt = pd.DataFrame({ - "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), - "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), - "area": 50 ** 2, - "mean_number_density": 0.0, - "mean_biomass_density": 0.0, - "abundance": 0.0, - "biomass": 0.0 -}) - -nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) -nasc_biology_output_a.loc[3, "x"] = 2 -nasc_biology_output_a.loc[3, "y"] = 3 -nasc_biology_output_a = nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", "latitude", "nasc", "number_density", "biomass_density"]) -nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) -nasc_biology_output["number_density"] = ( - nasc_biology_output["nasc"] - / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) -) -nasc_biology_output =nasc_biology_output.merge(general_weight_averages) -nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] * nasc_biology_output["average_weight"] -nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude", "number_density", "biomass_density"]) -nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0].reset_index() - -SQL(acoustic_db, "drop", table_name="reference") -SQL(acoustic_db, "drop", table_name="grid") - -SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) -SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) - -SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) -SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) - -SQL(acoustic_db, "select", table_name="grid") -SQL(acoustic_db, "select", table_name="reference") - -sql_group_update(acoustic_db, dataframe=nasc_biology_output, - table_name="grid", columns=["number_density", "biomass_density"], - unique_columns=["stratum", "x", "y", "longitude", "latitude"]) - -SQL(acoustic_db, "select", table_name="grid") - -from typing import List - -data_table = "grid" -grid_table = "reference" -column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] - -dataframe = nasc_biology_output - -import sqlalchemy as sqla -grid_db_file = file_configuration["database"]["grid"] -survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" -data_table = "survey_data_df" -grid_table = "grid_df" -coordinates = ["x", "y"] -from echopop.live.sql_methods import SQL - -SQL(grid_db_file, "select", table_name=grid_table) -SQL(survey_db_file, "select", table_name=data_table) -SQL(data_table, "map") - -gridding_column = self.config["gridding_column"] - -updated_survey_data = nasc_biology.copy() -# Get relevant table -previous_grid = 
query_dataset(grid_db_file, updated_survey_data, - table_name=grid_table, - data_columns=["x", "y", "area", "number_density_mean", - "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) -previous_data = query_dataset(survey_db_file, updated_survey_data, - table_name=data_table, - data_columns=["x", "y", "number_density", "biomass_density"], - unique_columns=["x", "y"]) -# Get unique coordinates -update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) - - -# Index -previous_grid.set_index(["x", "y"], inplace=True) -previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"].mean() -previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() - -# Convert area from m^2 to nmi^2 -previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] -previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] -previous_grid = previous_grid.reset_index() - -sql_group_update(grid_db_file, dataframe=previous_grid, - table_name=grid_table, - columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) - -murr = SQL(grid_db_file, "select", table_name=grid_table) -murr[murr.abundance > 0] - -update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() - -am = SQL(grid_db_file, "select", table_name="grid_df") -am[am.abundance > 0] -bm = SQL(grid_db_file, "select", table_name="grid_df") -bm[bm.abundance > 0] -number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() - -SQL(grid_db_file, "select", table_name=grid_table) - - - -pulled_data = pd.concat([SQL(grid_db_file, "select", - table_name=grid_table, - condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) -previous_cell_data = pd.concat([SQL(survey_db_file, "select", - table_name=data_table, - condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) - -from echopop.live.live_data_processing import get_nasc_sql_data, get_sigma_bs_sql_data, get_average_strata_weights, summarize_strata -from echopop.live.sql_methods import sql_group_update -from typing import List -from shapely.geometry import box -SQL(grid_db_file, "select", table_name="grid_df") -# Compute means -number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() -previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() - -[SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] - -# Write to the database file (for the grid) -# ---- Create engine -engine = sqla.create_engine(f"sqlite:///{db_filepath}") - -def update_population_grid(grid_db_file: str, - data_table: str, - grid_table: str, - dataframe: pd.DataFrame, - column_pairs: Union[List[tuple[str, str]], tuple[str, str]], - coordinates: List[str]): - - # Convert `column_pairs` to a list, if needed - if not isinstance(column_pairs, list): - column_pairs = [column_pairs] - - dataframe[coordinates] - # Format the coordinate pairs - # ---- Convert coordinate values into a list of tuples - coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] - # ---- Get unique pairs - 
coords = list(set(coord_pairs)) - - # Format the SQL script command - # ---- Initialize - sql_script = [] - # ---- Iteratively update - for input_column, output_column in column_pairs: - sql_script.append( - f""" - BEGIN TRANSACTION; - - -- Calculate averages for input_column and update grid_table - WITH avgs AS ( - SELECT - {coordinates[0]}, - {coordinates[1]}, - AVG(d.{input_column}) as avg_value - FROM {data_table} d - GROUP BY d.{coordinates[0]}, d.{coordinates[1]} - ) - - -- Update the grid_table with both average and computed total - UPDATE {grid_table} - SET - mean_{input_column} = ( - SELECT avg_value - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ), - {output_column} = ( - SELECT avg_value * {grid_table}.area - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ) - WHERE EXISTS ( - SELECT 1 - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ); - - COMMIT; - """ - ) - - # Create the engine - engine = create_engine(f"sqlite:///{db_file}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript("\n".join(sql_script)) - - - -def update_population_grid(db_file: str, - data_table: str, - grid_table: str, - dataframe: pd.DataFrame, - column_pairs: Union[List[tuple[str, str]], tuple[str, str]], - coordinates: List[str]): - - # Convert `column_pairs` to a list, if needed - if not isinstance(column_pairs, list): - column_pairs = [column_pairs] - - dataframe[coordinates] - # Format the coordinate pairs - # ---- Convert coordinate values into a list of tuples - coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] - # ---- Get unique pairs - coords = list(set(coord_pairs)) - - # Format the SQL script command - # ---- Initialize - sql_script = [] - # ---- Iteratively update - for input_column, output_column in column_pairs: - sql_script.append( - f""" - BEGIN TRANSACTION; - - -- Calculate averages for input_column and update grid_table - WITH avgs AS ( - SELECT - {coordinates[0]}, - {coordinates[1]}, - AVG(d.{input_column}) as avg_value - FROM {data_table} d - GROUP BY d.{coordinates[0]}, d.{coordinates[1]} - ) - - -- Update the grid_table with both average and computed total - UPDATE {grid_table} - SET - mean_{input_column} = ( - SELECT avg_value - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ), - {output_column} = ( - SELECT avg_value * {grid_table}.area - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ) - WHERE EXISTS ( - SELECT 1 - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ); - - COMMIT; - """ - ) - - # Create the engine - engine = create_engine(f"sqlite:///{db_file}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript("\n".join(sql_script)) - - -SQL(acoustic_db, "select", table_name=data_table) -SQL(acoustic_db, "select", table_name=grid_table) - - -SQL(acoustic_db, "update", table_name="grid", 
dataframe=nasc_biology_output, unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) -SQL(acoustic_db, "select", table_name="reference") - -source_db = acoustic_db -target_db = biology_db - -source_table = "grid" -target_table = "strata_summary_df" - -data_columns = ["number_density", "biomass_density"] -strata_columns = ["stratum"] -strata = [2] -stratum_list = ', '.join(map(str, stratum_values)) - -data_column = data_columns[0] -data_columns = data_columns[0] -def sql_update_strata_summary(source_db: str, - target_db: str, - arg_fun: str, - data_columns: List[tuple[str, str]], - strata: list): - - # Format strata list as a string - strata_str = ', '.join(map(str, strata)) - - # Function reference map - FUNCTION_MAP = { - "sum": {"function": "SUM", - "suffix": "sum"}, - "mean": {"function": "AVG", - "suffix": "mean"} - } - - # Prepare the SQL script - sql_script = f""" - -- Attach the source and target databases - ATTACH DATABASE '{source_db}' AS source; - ATTACH DATABASE '{target_db}' AS target; - - """ - - # Dynamically format the cross-database command - for data_column, method in data_columns: - # ----- Format the function-method-suffic keys - suffix = FUNCTION_MAP[method]["suffix"] - fun = FUNCTION_MAP[method]["function"] - # ---- Create the combined SQL command using f-strings - sql_script += f""" - -- Calculate averages and directly update the target table - UPDATE target.{target_table} - SET {data_column}_{suffix} = ( - SELECT {fun}({data_column}) - FROM source.{source_table} - WHERE stratum = target.{target_table}.stratum - ) - WHERE stratum IN ({strata_str}); - """ - # ----- Append DETACH commands only once at the end - sql_script += """ - -- Detach the databases - DETACH DATABASE source; - DETACH DATABASE target; - """ - - # Create the engine - engine = create_engine(f"sqlite:///{target_db}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript(sql_script) - -SQL(biology_db, "select", table_name=target_table) -SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() -connection.close() -dbapi_conn.close() - - -pairs = [(1, 2), (3, 4), (5, 6)] - -# Convert the pairs into a format suitable for SQL IN clause -pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) - -# Construct the SQL command as a text string -sql_command = f''' -BEGIN TRANSACTION; - -UPDATE reference -SET total = ( - SELECT AVG(g.sigma_bs) * r.area - FROM grid g - WHERE g.stratum = r.stratum_x -) -WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); - -COMMIT; -''' - -psi = 10 ** (-21/10) -psi * 280**2 * 1500 * 128e-6 / 2 -psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] - -psi * (280.0 ** 2) / 1852 ** 2 -depth_area = 280 ** 2 * psi -swath_length = 0.5 * 1852 -depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] -280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] - -SQL(acoustic_db, "map") -beam_angle = 9.0 * np.pi / 180.0 -280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] -280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * nasc_biology["number_density"] -area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) -area / 1852 ** 2 * nasc_biology["number_density"] -SQL(acoustic_db, "map") - -# Merge hake fraction data into `nasc_interval_df` -# ---- Initial merge -nasc_interval_df = nasc_interval_df.merge( - 
input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" -) -# ---- Replace `fraction_hake` where NaN occurs -nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) -# ---- Drop NaN -nasc_interval_df.dropna(subset=["transect_num"], inplace=True) - -# Calculate the along-transect number density (animals per nmi^2) -# ---- Merge NASC measurements with mean sigma_bs for each stratum -nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) -# ---- Calculate the number densities -nasc_biology["number_density"] = ( - nasc_biology["fraction_hake"] - * nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) -) - - -if working_dataset == "acoustic": - db_file = self.config["database"]["acoustic"] -elif working_dataset == "biology": - db_file = self.config["database"]["biology"] -else: - raise ValueError( - f"Argument for `working_dataset` [{working_dataset}] is invalid." - f" Value must either be 'acoustic' or 'biology'." - ) - -# Extract the necessary correct strata mean sigma_bs -sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] - -# Pull out the length-weight conversion for each stratum -length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] - -# Get the name of the stratum column -stratum_col = settings_dict["transect"]["stratum_name"] - - -catch_data = self.input["biology"]["catch_df"] - -# Get the spatial column name, if there is one -spatial_column = file_configuration["spatial_column"] -# ---- Append additional columns that will be used -contrast_columns = spatial_column + ["sex", "species_id"] - -# Calculate grouped totals -# ---- Sum the net haul weights from station 1/unaged fish -catch_weights = catch_data.count_variable( - contrasts=["species_id"] + spatial_column, - variable="haul_weight", fun="sum" -) -# ---- Rename resulting columns for both -catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - -# ---- Specimen -specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - -specimen_weight_binned -# Calculate the sexed and total stratum weights for each sex among unaged fish -# ---- Sum the net haul weights from station 1/unaged fish -catch_weights = catch_data.count_variable( - contrasts=["species_id"] + file_configuration["spatial_column"], - variable="haul_weight", fun="sum" -) -# ---- Rename resulting columns for both -catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - -# For the specimen data -# ---- Sum the net haul weights from station 1/unaged fish -# ---- Specimen -specimen_weights_sex = ( - specimen_weight_binned - .groupby(contrast_columns)["weight"] - .sum() -) -# ---- Total (per stratum, if it exists) -specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) - -# For the length (unaged) dataset -length_weights_sex = ( - length_weight_binned - .groupby(contrast_columns)["weight_interp"] - .sum() -) -# ---- Further reduce to the grand total (per stratum, if it exists) -length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) - -# ---- Standardize the unaged sexed weights -length_weight_standardized = ( - (length_weights_sex / length_weight_total).unstack(0) - * catch_weights["total_weight"].to_numpy() -) - -# Calculate the specimen weight proportions -# ---- Pivot weight bins -specimen_weight_binned_pvt = ( - specimen_weight_binned.pivot_table( - columns=spatial_column, - index=["length_bin", "species_id", "sex"], - values="weight", - observed 
= False - ) -) -# ---- Divide by the aged stratum weights (relative to only aged fish) -specimen_weight_proportions_pvt = ( - specimen_weight_binned_pvt / specimen_weight_total.to_numpy() -) -# ---- Pivot back to the desired format -specimen_weight_proportion = ( - specimen_weight_proportions_pvt - .stack().reset_index(name="weight_proportion") - .pivot_table(columns=stratum_column + ["species_id", "sex"], - index="length_bin", values="weight_proportion") -) -# ---- Calculate the internal (i.e. only aged fish) for each sex -within_specimen_sex_proportions = ( - specimen_weight_proportion.sum() -) - -# Calculate the total strata weights -# ---- Index `catch_weights` -catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) -# ---- Compute the spatially-stratified/grouped weights -spatial_weights = ( - pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) - .pivot_table( - columns=stratum_column, - aggfunc="sum", - values="total_weight", - observed=False - ) -) - -# Calculate the weight proportions relative to the overall stratum weights -# ---- Aged -# -------- Reformat into dataframe and merge with total stratum weights -specimen_weights_binned_df = ( - specimen_weight_binned_pvt.stack() - .to_frame("specimen_weight") - .reset_index() - .merge(spatial_weights.T.reset_index(), on=stratum_column) -) -# -------- Calculate proportions -specimen_weights_binned_df["weight_proportion_overall"] = ( - specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] -) -# -------- Consolidate to calculate the sexed proportions per stratum -specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column + ["species_id", "sex"])[ - "weight_proportion_overall" -].sum() -# ---- Unaged -# -------- Reformat into dataframe and merge with total stratum weights -length_weights_sex_standardized_df = ( - length_weight_standardized.stack() - .to_frame("catch_weight") - .reset_index() - .merge(spatial_weights.T.reset_index(), on=stratum_column) -) -# -------- Calculate proportions -length_weights_sex_standardized_df["weight_proportion_overall"] = ( - length_weights_sex_standardized_df["catch_weight"] - / length_weights_sex_standardized_df["total_weight"] -) -# -------- Back-calculate the sexed weight proportions relative to just unaged fish -# ------------ Aggregate proportions -length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( - columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" -).transpose().unstack(["species_id"]).sum(axis=0) -# ------------ Re-compute the proportions -length_weight_sex_proportions = ( - length_weights_sex_standardized_df.pivot_table( - index=["species_id", "sex"], columns=stratum_column, - values="weight_proportion_overall" - ) - / length_total_sex_proportions.to_numpy() -) - -# Compute the overall length-binned weight distributions among unaged fish -# ---- Extract the number proportions computed for unaged fish -length_number_proportions = length_number_proportion.copy() -# ---- Filter out values besides those computed for 'all' fish -length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] -# ---- Convert to a table -length_number_proportions_tbl = length_number_proportions.pivot_table( - columns=stratum_column + ["species_id"], - index=["length_bin"], - values="proportion_number_length", - aggfunc="sum", - observed=False, -) -# ---- Extract the fitted weight values calculated for all fish 
-length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] -# ---- Generate the fitted weight array -fitted_weights = length_weight_all.copy() -# ---- Get actual length bins in dataset -fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] -# ---- Apportion the averaged weights -length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() -# ---- Compute the average weight proportions per length bin per stratum -average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) -# ---- Convert back to a DataFrame -average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( - name="weight_proportion" -) - -# Calculate the aged and unaged weight proportions -# ---- Aged -aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) -# ---- Unaged -unaged_proportions = 1 - aged_proportions -# -------- Re-weight the unaged sexed proportions -unaged_weight_sex_proportions_overall = ( - (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) -) - -unaged_proportions.unstack().transpose() -# Format the outputs -# ---- Aged: stratum-sex-age-length relative to aged and total weights -aged_overall_df = ( - specimen_weight_proportion.unstack() - .reset_index(name="weight_proportions") - .merge( - specimen_weights_binned_df[ - stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] - ] - ) -) -# ---- Aged: stratum-sex relative to total weights -aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( - stratum_column + ["species_id", "sex"] - ) -# ---- Add the aged sex proportiosn relative to the overall survey -aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions -# ---- Consolidate the aged and unaged sexed dataframes -# -------- Initialize the dataframe -aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + stratum_column) -# --------- Add the within-unaged weight proportions -aged_unaged_sex_proportions["weight_proportion_unaged"] = ( - length_weight_sex_proportions.stack() -) -# --------- Add the overall-unaged weight proportions -aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( - unaged_weight_sex_proportions_overall.stack() -) -# ---- Overall aged and unaged proportions -aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") -# ---- Set index -aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) -# -------- Add unaged proportions -aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() -# ---- Reset the index -aged_unaged_proportions = aged_unaged_proportions.reset_index() -#################################################################################################### -# * Functionality for reading in processed acoustic data -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
-# TODO: Documentation -file_settings = file_configuration["input_directories"]["acoustics"] -root_directory = file_configuration["data_root_dir"] - - -#################################################################################################### -def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): - - # Get all database files - database_files = file_configuration["database"] - - # Iterate through all keys - for _, db_file in database_files.items(): - # ---- Map the table names - table_names = SQL(db_file, "map") - # ---- Drop any noted exceptions - if not isinstance(table_exception, list): - table_exception = [table_exception] - # ---- Drop exception table name - if None not in table_exception: - table_names = list(set(table_names) - set(table_exception)) - _ = [SQL(db_file, "drop", table_name=table) for table in table_names] - # ---- Validate that all tables were removed - if set(table_names).intersection(set(SQL(table_names, "map"))): - raise ValueError( - f"Attempted reset of [{str(db_file)}] failed." - ) - -SPATIAL_CONFIG_MAP = { - "closest_haul": { - "proximity": { - "choices": ["distance", "time"], - }, - }, - "global" : {}, - "griddify": { - "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, - "pairs": [("longitude", "latitude"), ("northings", "eastings")], - }, - "grid_resolution": { - "x_distance": { - "types": float, - }, - "y_distance": { - "types": float, - }, - "d_longitude": { - "types": float, - }, - "d_latitude": { - "types": float, - }, - "grid_size_x": { - "types": int, - }, - "grid_size_y": { - "types": int, - }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], - }, - }, - "inpfc": { - "stratum_names": { - "types": [int, str] - }, - "latitude_max": { - "types": [float], - }, - }, - "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, - }, -} - - - -reset_db_files(file_configuration, table_exception = "files_read") -reset_db_files(file_configuration) - -stamp = 20240714194248 -stamp.astype(int) -int(stamp) -import re -from datetime import datetime - -def infer_datetime_format(timestamp_str: Union[int, str]): - patterns = { - r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS - r"^\d{8}$": "%Y%m%d", # YYYYMMDD - r"^\d{6}$": "%H%M%S", # HHMMSS - r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS - r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS - r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD - r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD - } - - for pattern, date_format in patterns.items(): - if re.match(pattern, timestamp_str): - return date_format - - raise ValueError("Unknown timestamp format") - -filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) - -def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): - - # Create dataframe copy - data_copy = biology_data.copy() - - # Iterate through dictionary to apply filters (if present) - for column, value in filter_dict.items(): - if column in data_copy.columns: - data_copy = data_copy[data_copy[column] == value] - - # Return output - return data_copy - - - -df[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] -df[(df["species_id"] == 17 if "species_id" in df.columns)] - -(df[df["haul_num"] == 17 if "haul_num" in df.columns] else 
True) - - -from datetime import datetime - -df = biology_output["trawl_info_df"] -df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] -df.index - -biology_output["trawl_info_df"].reset_index().index -df = biology_output["catch_df"] -df = df.loc[0, :].to_frame().T -df.index -df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] - -def convert_datetime(timestamp: Union[int, str, pd.Series]): - - if isinstance(timestamp, pd.Series): - test_timestamp = str(timestamp[0]) - else: - test_timestamp = str(timestamp) - - # Approximate the datetime format - datetime_format = infer_datetime_format(str(test_timestamp)) - - # - if isinstance(timestamp, pd.Series): - return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) - else: - return datetime.strptime(timestamp, datetime_format) - -infer_datetime_format(stamp) -convert_datetime(stamp) -infer_datetime_format(202407) - -# {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} -file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" -file_configuration["geospatial"] -spatial_config = file_configuration["geospatial"] -############### - -acoustic_data = self.input["acoustics"] -biology_data = self.input["biology"] - - - -from echopop.live.live_core import SPATIAL_CONFIG_MAP - -def load_spatial_data(acoustic_data: dict, - biology_data: dict, - file_configuration: dict,): - - # Extract spatial strata *only* if spatial information from the configuration settings - # ---- Get (geo)spatial config - spatial_config = file_configuration["geospatial"] - # ---- Remove case sensitivity - spatial_config = {key.lower(): value for key, value in spatial_config.items()} - # ---- Extract the projection - projection = spatial_config["projection"] - # ---- Extract the biology-acoustics linking method options - acoustics_biology_link = spatial_config["link_biology_acoustics"] - - # Validate the configuration - validate_spatial_config(spatial_config) - - # Create spatial dictionary that will be added as an `input` - spatial_dict = {"link_method": acoustics_biology_link} - - # Assign the spatial link constraints to the acoustic and biological data - if acoustics_biology_link == "INPFC": - spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) - - # Return the dictionary as an output - return spatial_dict - - - - # Convert the DataFrame to a GeoDataFrame - acoustic_data_gdf = gpd.GeoDataFrame( - data=acoustic_data, - geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), - crs=projection - ) - - # Validate the spatial biology-acoustics linking method - # ---- Get the biology-acoustics linking method - link_method = next(key for key, value in acoustics_biology_link.items() if value) - # ---- Flag Error if unexpected method - if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
- ) - -#################################################################################################### -# TEST: BIOLOGY FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -biology_data, file_configuration = load_biology_data(file_configuration) -biology_data -#################################################################################################### -prc_nasc_df = acoustic_data["prc_nasc_df"] - -def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics)) - .reset_index() - ) - # ---- Amend the dtypes if echometrics were computed - if echometrics: - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation": float, "occupied_area": float}) - ) - - # Get the name of the associated db file - acoustics_db = file_configuration["database"]["acoustics"] - # ---- Get current tables - tables = SQL(acoustics_db, "inspect") - - # - if "nasc_df" not in tables: - _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) - else: - # ---- - nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") - # ---- - index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) - # ---- - bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) - # ---- - _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df.loc[~bool_idx]) - # ---- - nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) - - # Return the output - return nasc_data_df - - -SQL(acoustics_db, command="drop", table_name="nasc_df") -SQL(acoustics_db, "inspect") - -nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) - -SQL(acoustics_db, command="select", table_name="nasc_df") - -TS_SLOPE = 20.0 -TS_INTERCEPT = -68.0 - -# CONVERT TO TS -comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT -# TO SIGMA_BS -comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) -# WEIGHTED MEAN SIGMA_BS -sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) - -from typing import Optional -from echopop.utils import operations -from echopop.acoustics import ts_length_regression, to_linear, to_dB - -__all__ = ["operations"] - -# Meld bio datasets -length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], - contrasts=["haul_num", "sex", "species_id", "length"]) - -# Create distribution -distrib_params = file_configuration["biology"]["length_distribution"]["bins"] - -length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float) -binwidth = np.diff(length_bins / 2.0).mean() -intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) -length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) -# -length_datasets["length_bin"] = pd.cut(length_datasets["length"], bins=intervals, labels=length_bins_df["bin"]) - -stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] - -if stratify_key == "global": - length_distribution = ( - 
length_datasets.pivot_table(columns=["sex"], index=["length_bin"], - values="length_count", aggfunc="sum", observed=False) - ) - # - length_distribution["total"] = length_distribution.sum(axis=1) - -length_distribution.transpose() -SQL(biology_db, "drop", table_name="length_distribution") -# Get the name of the associated db file -biology_db = file_configuration["database"]["biology"] -# ---- Get current tables -tables = SQL(biology_db, "inspect") - - -if "length_distribution" not in tables: - _ = SQL(biology_db, "insert", table_name="length_distribution", - dataframe=length_distribution.transpose()) - - -SQL(biology_db, "select", table_name="length_distribution") -SQL(biology_db, "drop", table_name="length_distribution") -SQL(biology_db, "replace", table_name="length_distribution", dataframe=length_distribution.unstack().reset_index(name="count")) -length_distribution.unstack().reset_index(name="count") -mixed = SQL(biology_db, "select", table_name="length_distribution") -length_bins[:1] -from typing import Optional -from echopop.utils import operations -from echopop.acoustics import ts_length_regression, to_linear, to_dB - -__all__ = ["operations"] - -biology_data = self.input["biology"] - -# Meld bio datasets -length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], - contrasts=["haul_num", "species_id", "length"]) - -ts_length_parameters_spp = [ - spp - for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() - if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) -] - -# ---- get species info -target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) - -ts_lengths_df = length_datasets.merge( - target_species.drop("length_units", axis=1), - left_on=["species_id"], - right_on=["number_code"], -) -# ---- filter out other spp -length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] - +# import contextlib +# import copy +# import glob +# import os +# import re +# from datetime import datetime +# from pathlib import Path +# from typing import Optional, Tuple, Union + +# import geopandas as gpd +# import matplotlib.cm as cm +# import matplotlib.colors as colors +# import matplotlib.pyplot as plt +# import numpy as np +# import pandas as pd +# import shapely.geometry +# import xarray as xr +# import yaml +# from geopy.distance import distance +# from matplotlib.colors import ListedColormap +# from shapely import wkt +# from shapely.geometry import box +# from sqlalchemy import Engine, create_engine, inspect, text + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.live import live_data_loading as eldl, live_data_processing as eldp +# from echopop.live.live_acoustics import configure_transmit_frequency, integrate_nasc +# from echopop.live.live_biology import preprocess_biology_data +# from echopop.live.live_core import ( +# LIVE_DATA_STRUCTURE, +# LIVE_FILE_FORMAT_MAP, +# LIVE_INPUT_FILE_CONFIG_MAP, +# SPATIAL_CONFIG_MAP, +# ) +# from echopop.live.live_data_loading import validate_data_directory +# from echopop.live.live_data_processing import get_unique_identifiers, query_dataset +# from echopop.live.live_survey import LiveSurvey +# from echopop.live.sql_methods import ( +# SQL, +# SQL_COMMANDS, +# format_sql_columns, +# initialize_database, +# query_processed_files, +# sql_data_exchange, +# sql_group_update, +# sql_update_strata_summary, +# ) +# from echopop.spatial.projection import utm_string_generator +# from echopop.survey import Survey + +# 
self = realtime_survey +# spatial_config = self.config["geospatial"] +# dataset = self.input["acoustics"]["nasc_df"] + + +# survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization +# _config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config +# .yml") +# survey_2019.transect_analysis() +# survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] +# analysis_dict = survey_2019.analysis["transect"] +# SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") +# proportions_dict=analysis_dict["biology"]["proportions"]["number"] +# length_weight_dict = analysis_dict["biology"]["weight"] +# stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] + +# updated_survey_data = nasc_biology.copy() +# gridding_column = file_configuration["gridding_column"] + +# unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) + + +# file_configuration = self.config +# grid_settings["grid_resolution"]["x"] = 50 +# grid_settings["grid_resolution"]["y"] = 50 +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +# self = realtime_survey +# file_configuration = self.config + +# def initialize_grid(): + +# # Get root directory, if defined +# if "data_root_dir" in file_configuration: +# root_dir = Path(file_configuration["data_root_dir"]) +# else: +# root_dir = Path() + +# # Get `grid` settings +# grid_database = file_configuration["input_directories"]["grid"]["database_name"] + +# # Create full filepath +# db_filepath = root_dir / "database" / grid_database + +# # Create if file doesn't already exist +# if not db_filepath.exists(): + +# # Get projection +# projection = file_configuration["geospatial"]["projection"] + +# # Get grid settings +# grid_settings = file_configuration["geospatial"]["griddify"] + +# # Get the resolution +# resolution = grid_settings["grid_resolution"] +# # ---- Convert from nmi to m +# resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + +# # Get boundary coordinates +# boundary = grid_settings["bounds"] +# # ---- x +# x = boundary["longitude"] +# # ---- y +# y = boundary["latitude"] +# # ---- Create DataFrame +# boundary_df = pd.DataFrame({ +# "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), +# "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) +# }) + +# # Create GeoDataFrame +# boundary_gdf = gpd.GeoDataFrame( +# data = boundary_df, +# geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), +# crs = projection +# ) + +# # Convert to UTM (decimal degrees to m) +# # ---- Create UTM code +# utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, +# (boundary_df.y.min() + boundary_df.y.max()) / 2) +# # ---- Create number code +# utm_num = int(utm_code) +# # ---- Create string code +# utm_str = f"epsg:{utm_num}" +# # ---- UTM conversion +# boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + +# # Get step sizes for each grid cell +# # ---- x +# x_step = resolution_m["x_distance"] +# # ---- y +# y_step = resolution_m["y_distance"] + +# # Prepare grid cell generation +# # ---- Get new boundaries +# xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds +# # ---- Initialize empty list +# grid_cells = [] +# # ---- Initialize coordinate counter +# y_ct = 0 +# x_coord = []; y_coord = [] +# # ---- Iterate through to generate cells +# for y0 in 
np.arange(ymin, ymax, y_step): +# y_ct += 1 +# x_ct = 0 +# for x0 in np.arange(xmin, xmax, x_step): +# x_ct += 1 +# # ---- Step forward +# x_coord.append(x_ct) +# y_coord.append(y_ct) +# x1 = x0 - x_step +# y1 = y0 + y_step +# # ---- Append to list +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# # Convert to a GeoDataFrame +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +# # ---- Add coordinates +# cells_gdf.loc[:, "x"] = np.array(x_coord) +# cells_gdf.loc[:, "y"] = np.array(y_coord) + +# # Get coastline shapefile directory, if defined +# if "coastline" in file_configuration["input_directories"]: + +# # Get coastline settings +# coast_settings = file_configuration["input_directories"]["coastline"] +# # ---- Create filepath +# shp_filepath = ( +# root_dir / coast_settings["directory"] +# / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" +# ) +# # ---- Validate existence +# if not shp_filepath.exists(): +# raise FileNotFoundError( +# f"{shp_filepath} does not exist!" +# ) + +# # Get original lat/lon geometry boundaries +# xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + +# # Read in file +# full_coast = gpd.read_file(shp_filepath) +# # ---- Convert to UTM +# full_coast_utm = full_coast.to_crs(utm_code) +# # ---- Remove empty +# full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + +# # Create bounding box with a buffer +# boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) +# # ---- Create an unbuffered copy +# boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) +# # ---- Convert to a GeoDataFrame +# boundary_box_unbuffered_gdf = ( +# gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) +# ) +# # ---- Clip the coastline for saving +# clipped_coast_original = ( +# gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) +# ) + +# # Clip the coastline shapefile +# clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + +# # Clip the grid cells +# cells_gdf.loc[:, "geometry"] = ( +# cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) +# ) + +# # Calculate area per cell +# cells_gdf.loc[:, "area"] = cells_gdf.area + +# # Convert back to original projection and clip +# clipped_cells_latlon = ( +# gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) +# .reset_index(drop=True) +# ) + +# # Initialize empty columns that can be added to later on +# clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", +# "abundance", "biomass"]] = 0.0 + +# # Create output DataFrame +# output_df = pd.DataFrame({ +# "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) +# }) +# # ---- Add the required columns +# output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], +# axis=1) +# # ---- Initialize empty columns that can be added to later on +# output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", +# "biomass"]] = 0.0 + +# # Write to the database file (for the grid) +# # ---- Create engine +# engine = sqla.create_engine(f"sqlite:///{db_filepath}") +# # ---- Connect and create table +# _ = output_df.to_sql("grid_df", engine, if_exists="replace") + +# # Write to the database file (for the coastline shapefile) +# # ---- Create output copy +# coastline_out = pd.DataFrame({ +# "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) +# }) +# # ---- Concatenate +# coastline_out = ( +# pd.concat([coastline_out, 
clipped_coast_original.drop(columns="geometry")], +# axis=1) +# ) +# # ---- Connect and create table +# _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") + +# ################################################################################################## +# # TEST: YAML FILE CONFIGURATION +# # ---- Define filepaths +# self = LiveSurvey +# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initia +# lization_config.yml" +# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_ +# year_2019_config.yml" +# # ---- Run function: `live_configuration` +# file_configuration = self.config +# files = biology_files + +# biology_output = initial_biology_output +# file_configuration = self.config +# table_name = "length_df" +# df = filtered_biology_output[table_name] +# database_file = biology_db +# kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], +# output_type=pd.DataFrame) + +# # NOTE: ARGUMENT: {working_dataset: Literal["acoustics", "biology"]} +# working_dataset = "acoustics" +# self = realtime_survey +# file_configuration = self.config +# self.results["biology"] = self.input["biology_processed"] +# self.results["acoustics"] = self.input["nasc_df"] + +# # Get spatial column +# spatial_column = file_configuration["spatial_column"] + +# # Initialize the working data dictionary +# working_data = copy.deepcopy(self.results) +# contrast_columns = [] +# # ---- Define unique columns +# unique_columns = spatial_column + contrast_columns + +# acoustic_db = file_configuration["database"][working_dataset] +# self = realtime_survey +# acoustic_dict = self.input["acoustics"] +# verbose = True +# contrast_columns = [] +# db_file = acoustic_db +# table_name="survey_data_df" +# data_columns = data_columns +# unique_columns=unique_columns +# constraint="nasc > 0.0" +# data_dict = self.input["acoustics"] +# data_dict["nasc_df"]["stratum"] = 1 +# data_dict["prc_nasc_df"]["stratum"] = 2 +# table_name = "sigma_bs_mean_df" +# data_columns=["sigma_bs", "sigma_bs_count"] +# biology_db +# strata_df = self.input["spatial"]["strata"] + +# def biology_pipeline(biology_dict: dict, +# strata_df: pd.DataFrame, +# file_configuration: dict, +# verbose: bool, +# contrast_columns: List[str] = []): + +# # Get spatial column +# spatial_column = file_configuration["spatial_column"] +# unique_columns = spatial_column + contrast_columns + +# # Get database file +# acoustic_db = file_configuration["database"]["acoustics"] + +# # Get biology database file +# biology_db = file_configuration["database"]["biology"] + +# # Check for data completion +# # ---- List of boolean values +# full_biology_data = ( +# [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is # -file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] - -def average_sigma_bs(length: Union[pd.DataFrame, float, int], - TS_L_slope: Optional[float] = None, - TS_L_intercept: Optional[float] = None, - weighted: Optional[Union[float, int, str]] = None): - - # - if isinstance(length, pd.DataFrame): - if "length" not in length.columns: - raise ValueError( - "Column [`length`] missing from dataframe input `length`." - ) - if "TS_L_slope" not in length.columns and TS_L_slope is None: - raise ValueError( - "Value [`TS_L_slope`] missing from dataframe input `length` and optional " - "separate argument `TS_L_slope`." 
- ) - if "TS_L_intercept" not in length.columns and TS_L_intercept is None: - raise ValueError( - "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " - "separate argument `TS_L_intercept`." - ) - elif isinstance(length, float) or isinstance(length, int): - if TS_L_slope is None: - raise ValueError( - "Argument [`TS_L_slope`] missing." - ) - elif TS_L_slope is not None and not isinstance(TS_L_slope, float): - raise TypeError( - "Argument `TS_L_slope` must be type `float`." - ) - if "TS_L_intercept" not in length.columns and TS_L_intercept is None: - raise ValueError( - "Argument [`TS_L_intercept`] missing." - ) - elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): - raise TypeError( - "Argument `TS_L_intercept` must be type `float`." - ) - - # - if TS_L_slope is None: - TS_L_slope = length["TS_L_slope"] - - # - if TS_L_intercept is None: - TS_L_intercept = length["TS_L_intercept"] - - # - if isinstance(length, pd.DataFrame): - length_val = length["length"] - - ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) - sigma_bs_value = to_linear(ts_value) - - - - if isinstance(weighted, str): - if weighted not in length.columns: - raise ValueError( - f"Argument [`weighted` (str)], '{weighted}', is not a column in argument `length` " - f"(DataFrame)." - ) - else: - return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() - elif weighted is not None: - if weighted.size != sigma_bs_value.size: - raise ValueError( - f"Argument [`weighted` (float|int)] of size {weighted.size} does not match size of " - f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." - ) - else: - return (sigma_bs_value * weighted).sum() / weighted.sum() - else: - return sigma_bs_value.mean() - -def parse_condition(condition): - # Handle nested conditions and logical operators - condition = condition.replace('&', ' AND ').replace('|', ' OR ') - - # Handle "IN" lists and replace square brackets with parentheses - condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) - - # Handle range conditions for BETWEEN, including floats - condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', - lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) - - # Handle individual comparisons - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - - # Handle single equal sign - condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", condition) - - # Remove redundant spaces - condition = re.sub(r'\s+', ' ', condition).strip() - - return condition - -#################################################################################################### -def load_spatial_data(file_configuration: dict, - acoustic_data: pd.DataFrame, - coordinate_metadata: xr.Dataset): - - # Extract spatial strata *only* if spatial information from the configuration settings - # ---- Extract the projection - projection = file_configuration["geospatial"]["projection"] - # ---- Extract the biology-acoustics linking method options - acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] - - # Convert the DataFrame to a GeoDataFrame - acoustic_data_gdf = gpd.GeoDataFrame( - data=acoustic_data, - 
geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), - crs=projection - ) - - # Validate the spatial biology-acoustics linking method - # ---- Get the biology-acoustics linking method - link_method = next(key for key, value in acoustics_biology_link.items() if value) - # ---- Flag Error if unexpected method - if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." - ) - - # Create INPFC stratum dataframe - # ---- Extract - - # Validate projection information - # ---- Create a dummy GeoDataFrame to extract CRS information - # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) - # ---- Extract coordinate limits from the acoustic data - # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] - # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] - # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] - # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] - # # ---- Create boundary box string - # boundary_box_str = ( - # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " - # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" - # ) - - # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{utm_string_generator(lon_min, lat_min)}") - # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") - - # from pyproj import CRS - # from pyproj.aoi import AreaOfInterest - # from pyproj.database import query_utm_crs_info - - # utm_crs_list = query_utm_crs_info( - # datum_name="WGS 84", - # area_of_interest=AreaOfInterest( - # west_lon_degree=lon_min, - # south_lat_degree=lat_min, - # east_lon_degree=-lon_max, - # north_lat_degree=lat_max, - # ), - # ) - # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") - -#################################################################################################### -def live_data(file_configuration: dict): - - # Extract the file directories (or from the configuration) containing acoustic, biological, and - # spatial definitions/data/parameters - # ---- Acoustic data - acoustic_data = load_validated_acoustic_data(file_configuration) - # ---- Biological data - # ---- Spatial data - - - -#################################################################################################### -# * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) -# TODO: Update structure with additional information (as needed) -# TODO: Documentation -LIVE_DATA_STRUCTURE = { - "meta": { - "provenance": dict(), - "date": list(), - }, - "input": { - "acoustics": { - "nasc_df": pd.DataFrame(), - }, - "biology": { - "catch_df": pd.DataFrame(), - "distributions": { - "length_bins_df": pd.DataFrame(), - }, - "length_df": pd.DataFrame(), - "specimen_df": pd.DataFrame(), - }, - }, - "results": { - "acoustics": dict(), - "biology": dict(), - "stratified": dict(), - }, -} -#################################################################################################### -# * Define `LiveSurvey` class structure -# TODO: Incorporate validators -# TODO: Scope out full structure including accessors, attributes, and methods -# TODO: Configure input arguments (for initialization) -# TODO: Documentation 
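+# NOTE (editorial sketch, not part of the original patch): the `LiveSurvey` class below
+# deep-copies `LIVE_DATA_STRUCTURE` rather than assigning the template directly. A minimal,
+# self-contained example of why that matters (shared nested dicts vs. independent copies);
+# the `template`, `survey_a`, and `survey_b` names are illustrative only:
+# import copy
+#
+# import pandas as pd
+#
+# template = {"results": {"acoustics": dict(), "biology": dict(), "stratified": dict()}}
+# survey_a = copy.deepcopy(template["results"])
+# survey_b = copy.deepcopy(template["results"])
+# # Mutating one instance's results should not leak into the template or other instances
+# survey_a["acoustics"]["nasc_df"] = pd.DataFrame({"nasc": [1.0]})
+# assert "nasc_df" not in survey_b["acoustics"]
+# assert "nasc_df" not in template["results"]["acoustics"]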
-class LiveSurvey: - """ - A real-time processing version of the `echopop` base `Survey` class that ingests biological, - acoustic, and event meta data to provide population estimates when generated. - """ - - def __init__( - self, - live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path], - ): - # Initialize `meta` attribute - self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) - - # Loading the configuration settings and definitions that are used for defining the - # configuration settings - self.config = live_configuration(live_file_config_path, live_file_config_path) - - # Loading the datasets defined in the configuration files - self.input = el.load_survey_data(self.config) - - # Initialize the `results` data attribute - self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) - -current_units = zarr_data_ds["frequency_nominal"].units -acoustic_analysis_settings["transmit"] -file_configuration - -specimen_df = pd.DataFrame( - { - "haul_num": np.repeat([1,2,3], 4), - "station": "specimen", - "sex": np.tile(["male", "female"], 6), - "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), - "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, - }, -) - -length_df = pd.DataFrame( - { - "haul_num": np.repeat([1,2,3], 4), - "station": "length", - "sex": np.tile(["male", "female"], 6), - "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), - "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), - }, -) - -catch_df = pd.DataFrame( - { - "haul_num": np.array([1, 2, 3]), - "weight": np.array([503.12, 684.32, 978.54]) - } -) - -TS_SLOPE = 20.0 -TS_INTERCEPT = -68.0 - -acoustic_db = realtime_survey.config["database"]["acoustics"] -SQL(acoustic_db, "select", table_name="files_processed") -biology_db = realtime_survey.config["database"]["biology"] -SQL(biology_db, "select", table_name="files_processedk") -#### -# CONCATENATE FILE SOURCES -specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() -specimen_reframed -# MELD -all_lengths = pd.concat([length_df, specimen_reframed]) -# COMBINE -comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() - - -from echopop.live.sql_methods import SQL - -# Assuming that you have a LiveSurvey object defined -# ---- Get the database file name (and path) -biology_db = livesurvey_object.config["database"]["biology"] -# ---- -# CONVERT TO TS -comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT -# TO SIGMA_BS -comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) -# WEIGHTED MEAN SIGMA_BS -sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) - -# INTEGRATE NASC -path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" - -Path(path2file).exists() -xds = xr.open_dataset(path2file, engine="zarr") -xds -xdf = xds.to_dataframe().reset_index() -xdf["NASC"] = xdf["NASC"].fillna(0.0) -# convert frequency -xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) -# filter -xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] - -xdf_38.plot.scatter(x="distance", y="depth", c="NASC") -plt.show() - -xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() - -plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) -plt.plot(xdf_int["longitude"], 
xdf_int["latitude"]) -plt.show() - -# CONVERT TO NUMBER DENSITY -xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) - - -################### -from geopy.distance import distance -from shapely.geometry import Polygon, Point, box -import geopandas as gpd -from shapely.ops import unary_union -import pyproj - - -grid_settings = file_configuration["geospatial"]["griddify"] -grid = [] -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters -lat_min = grid_settings["bounds"]["latitude"][0] -lat_max = grid_settings["bounds"]["latitude"][1] -lon_min = grid_settings["bounds"]["longitude"][0] -lon_max = grid_settings["bounds"]["longitude"][1] - -utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) -utm_proj = pyproj.Proj(f"epsg:{utm_str}") -x_min, y_min = utm_proj(lon_min, lat_min) -x_max, y_max = utm_proj(lon_max, lat_max) - -lat = 55.5000 -lon = -134.2500 -utm_code = int(utm_string_generator(lon, lat)) -utm_proj = pyproj.Proj(f"epsg:{utm_code}") -utm_proj(lon, lat) -gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=projection).to_crs(utm_code) - - -num_lon_steps = int((x_max - x_min) / lon_step) -num_lat_steps = int((y_max - y_min) / lat_step) - -lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) -lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) -lon2 = lon1 + lon_step -lat2 = lat1 + lat_step - -# Convert UTM coordinates back to degrees -lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) -lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) - -# Convert UTM coordinates back to degrees with adjusted resolution -lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) -lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) - -polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lon2_deg, lat2_deg)] -grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") - -world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ne_110m_land/ne_110m_land.shp") -bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) -shapefile = world -clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) -clipped_shapefile.to_crs(utm_proj.srs) -# clipped_geometry = bbox.intersection(world.union_all()) -# clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) - -from shapely.geometry import MultiPolygon -# Create an empty list to store clipped geometries -# clipped_geometries = [] - -# # Iterate over each grid polygon -# for index, row in grid_gdf.iterrows(): -# # Intersect grid polygon with land shape -# intersection = row['geometry'].intersection(clipped_shapefile.unary_union) - -# # If intersection is a MultiPolygon, get the difference with the land shape -# if isinstance(intersection, MultiPolygon): -# clipped = row['geometry'].difference(clipped_shapefile.unary_union) -# if clipped.is_empty: -# continue -# clipped_geometries.append(clipped) +# not None] +# ) +# # ---- Validation +# if not all(full_biology_data): +# # ---- Print, if verbose +# if verbose: +# print( +# f"No new processed biology data available for processing." 
+# ) +# else: +# # Get related biology data +# acoustic_df = get_nasc_sql_data(acoustic_db, +# biology_dict, +# unique_columns=unique_columns) + +# # Get the corresopding `sigma_bs` data (and also compute the s +# ample-number weighted average) +# sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, +# biology_dict, +# unique_columns=unique_columns) + +# # Calculate population estimates if valid data are available +# if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): +# # ---- Merge the NASC and sigma_bs datasets +# nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) +# # ---- Compute the number densities (animals nmi^-2) +# nasc_biology["number_density"] = ( +# nasc_biology["nasc"] +# / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +# ) + +# # Get the corresponding average strata weights (computed for all fish) +# weight_spatial_averages = get_average_strata_weights(biology_db, +# biology_dict, +# unique_columns=unique_columns) + +# if weight_spatial_averages is not None: +# # Merge average weights with number density estimates +# nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + +# # Compute biomass densities +# nasc_biology["biomass_density"] = ( +# nasc_biology["number_density"] * nasc_biology["average_weight"] +# ) + +# # Update the survey population estimate DataFrame with the newly computed densities +# if not nasc_biology.empty: +# sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", +# columns=["number_density", "biomass_density"], +# unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + +# # Summarize strata +# summarize_strata(nasc_biology, strata_df, file_configuration) + +# db_file=acoustic_db +# dataframe=nasc_biology +# table_name="survey_data_df" +# columns=["number_density", "biomass_density"] +# unique_columns=["stratum", "longitude", "latitude", "ping_time"] +# nasc_biology["number_density"].sum() / 2 +# nasc_biology["number_density"] +# SQL(acoustic_db, "select", table_name="survey_data_df") +# SQL(biology_db, "select", table_name="strata_summary_df") +# strata_df = self.input["spatial"]["strata"].copy() +# strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", +# "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan +# strata_df.drop(columns=["latitude_interval"], inplace=True) +# SQL(acoustic_db, "select", table_name="survey_data_df") + +# SQL(biology_db, "drop", table_name="strata_summary_df") +# SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, +# primary_keys=["stratum"]) +# SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, +# id_columns=["stratum"]) + +# tt = pd.DataFrame({ +# "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), +# "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), +# "area": 50 ** 2, +# "mean_number_density": 0.0, +# "mean_biomass_density": 0.0, +# "abundance": 0.0, +# "biomass": 0.0 +# }) + +# nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) +# nasc_biology_output_a.loc[3, "x"] = 2 +# nasc_biology_output_a.loc[3, "y"] = 3 +# nasc_biology_output_a = nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", +# "latitude", "nasc", "number_density", "biomass_density"]) +# nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) +# nasc_biology_output["number_density"] = ( +# nasc_biology_output["nasc"] +# / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) +# ) +# nasc_biology_output 
=nasc_biology_output.merge(general_weight_averages) +# nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] +# * nasc_biology_output["average_weight"] +# nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude" +# , "number_density", "biomass_density"]) +# nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0] +# .reset_index() + +# SQL(acoustic_db, "drop", table_name="reference") +# SQL(acoustic_db, "drop", table_name="grid") + +# SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) +# SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) + +# SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) +# SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) + +# SQL(acoustic_db, "select", table_name="grid") +# SQL(acoustic_db, "select", table_name="reference") + +# sql_group_update(acoustic_db, dataframe=nasc_biology_output, +# table_name="grid", columns=["number_density", "biomass_density"], +# unique_columns=["stratum", "x", "y", "longitude", "latitude"]) + +# SQL(acoustic_db, "select", table_name="grid") + +# from typing import List + +# data_table = "grid" +# grid_table = "reference" +# column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] + +# dataframe = nasc_biology_output + +# import sqlalchemy as sqla + +# grid_db_file = file_configuration["database"]["grid"] +# survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" +# data_table = "survey_data_df" +# grid_table = "grid_df" +# coordinates = ["x", "y"] +# from echopop.live.sql_methods import SQL + +# SQL(grid_db_file, "select", table_name=grid_table) +# SQL(survey_db_file, "select", table_name=data_table) +# SQL(data_table, "map") + +# gridding_column = self.config["gridding_column"] + +# updated_survey_data = nasc_biology.copy() +# # Get relevant table +# previous_grid = query_dataset(grid_db_file, updated_survey_data, +# table_name=grid_table, +# data_columns=["x", "y", "area", "number_density_mean", +# "biomass_density_mean", "abundance", "biomass"], +# unique_columns=["x", "y"]) +# previous_data = query_dataset(survey_db_file, updated_survey_data, +# table_name=data_table, +# data_columns=["x", "y", "number_density", "biomass_density"], +# unique_columns=["x", "y"]) +# # Get unique coordinates +# update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) + + +# # Index +# previous_grid.set_index(["x", "y"], inplace=True) +# previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"] +# .mean() +# previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() + +# # Convert area from m^2 to nmi^2 +# previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] +# previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] +# previous_grid = previous_grid.reset_index() + +# sql_group_update(grid_db_file, dataframe=previous_grid, +# table_name=grid_table, +# columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], +# unique_columns=["x", "y"]) + +# myrrh = SQL(grid_db_file, "select", table_name=grid_table) +# myrrh[myrrh.abundance > 0] + +# update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"]) +# ["number_density"].mean() +# update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", 
"y"]) +# ["biomass_density"].mean() + +# am = SQL(grid_db_file, "select", table_name="grid_df") +# am[am.abundance > 0] +# bm = SQL(grid_db_file, "select", table_name="grid_df") +# bm[bm.abundance > 0] +# number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +# biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + +# SQL(grid_db_file, "select", table_name=grid_table) + + +# pulled_data = pd.concat([SQL(grid_db_file, "select", +# table_name=grid_table, +# condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) +# previous_cell_data = pd.concat([SQL(survey_db_file, "select", +# table_name=data_table, +# condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) + +# from typing import List + +# from shapely.geometry import box + +# from echopop.live.live_data_processing import ( +# get_average_strata_weights, +# get_nasc_sql_data, +# get_sigma_bs_sql_data, +# summarize_strata, +# ) +# from echopop.live.sql_methods import sql_group_update + +# SQL(grid_db_file, "select", table_name="grid_df") +# # Compute means +# number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() +# previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() + +# [SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = +# {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] + +# # Write to the database file (for the grid) +# # ---- Create engine +# engine = sqla.create_engine(f"sqlite:///{db_filepath}") + +# def update_population_grid(grid_db_file: str, +# data_table: str, +# grid_table: str, +# dataframe: pd.DataFrame, +# column_pairs: Union[List[tuple[str, str]], tuple[str, str]], +# coordinates: List[str]): + +# # Convert `column_pairs` to a list, if needed +# if not isinstance(column_pairs, list): +# column_pairs = [column_pairs] + +# dataframe[coordinates] +# # Format the coordinate pairs +# # ---- Convert coordinate values into a list of tuples +# coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] +# # ---- Get unique pairs +# coords = list(set(coord_pairs)) + +# # Format the SQL script command +# # ---- Initialize +# sql_script = [] +# # ---- Iteratively update +# for input_column, output_column in column_pairs: +# sql_script.append( +# f""" +# BEGIN TRANSACTION; + +# -- Calculate averages for input_column and update grid_table +# WITH avgs AS ( +# SELECT +# {coordinates[0]}, +# {coordinates[1]}, +# AVG(d.{input_column}) as avg_value +# FROM {data_table} d +# GROUP BY d.{coordinates[0]}, d.{coordinates[1]} +# ) + +# -- Update the grid_table with both average and computed total +# UPDATE {grid_table} +# SET +# mean_{input_column} = ( +# SELECT avg_value +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ), +# {output_column} = ( +# SELECT avg_value * {grid_table}.area +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ) +# WHERE EXISTS ( +# SELECT 1 +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ); + +# COMMIT; +# """ +# ) + +# # Create the engine +# engine = create_engine(f"sqlite:///{db_file}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = 
dbapi_conn.executescript("\n".join(sql_script)) + + +# def update_population_grid(db_file: str, +# data_table: str, +# grid_table: str, +# dataframe: pd.DataFrame, +# column_pairs: Union[List[tuple[str, str]], tuple[str, str]], +# coordinates: List[str]): + +# # Convert `column_pairs` to a list, if needed +# if not isinstance(column_pairs, list): +# column_pairs = [column_pairs] + +# dataframe[coordinates] +# # Format the coordinate pairs +# # ---- Convert coordinate values into a list of tuples +# coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] +# # ---- Get unique pairs +# coords = list(set(coord_pairs)) + +# # Format the SQL script command +# # ---- Initialize +# sql_script = [] +# # ---- Iteratively update +# for input_column, output_column in column_pairs: +# sql_script.append( +# f""" +# BEGIN TRANSACTION; + +# -- Calculate averages for input_column and update grid_table +# WITH avgs AS ( +# SELECT +# {coordinates[0]}, +# {coordinates[1]}, +# AVG(d.{input_column}) as avg_value +# FROM {data_table} d +# GROUP BY d.{coordinates[0]}, d.{coordinates[1]} +# ) + +# -- Update the grid_table with both average and computed total +# UPDATE {grid_table} +# SET +# mean_{input_column} = ( +# SELECT avg_value +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ), +# {output_column} = ( +# SELECT avg_value * {grid_table}.area +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ) +# WHERE EXISTS ( +# SELECT 1 +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ); + +# COMMIT; +# """ +# ) + +# # Create the engine +# engine = create_engine(f"sqlite:///{db_file}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = dbapi_conn.executescript("\n".join(sql_script)) + + +# SQL(acoustic_db, "select", table_name=data_table) +# SQL(acoustic_db, "select", table_name=grid_table) + + +# SQL(acoustic_db, "update", table_name="grid", dataframe=nasc_biology_output, +# unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) +# SQL(acoustic_db, "select", table_name="reference") + +# source_db = acoustic_db +# target_db = biology_db + +# source_table = "grid" +# target_table = "strata_summary_df" + +# data_columns = ["number_density", "biomass_density"] +# strata_columns = ["stratum"] +# strata = [2] +# stratum_list = ', '.join(map(str, stratum_values)) + +# data_column = data_columns[0] +# data_columns = data_columns[0] +# def sql_update_strata_summary(source_db: str, +# target_db: str, +# arg_fun: str, +# data_columns: List[tuple[str, str]], +# strata: list): + +# # Format strata list as a string +# strata_str = ', '.join(map(str, strata)) + +# # Function reference map +# FUNCTION_MAP = { +# "sum": {"function": "SUM", +# "suffix": "sum"}, +# "mean": {"function": "AVG", +# "suffix": "mean"} +# } + +# # Prepare the SQL script +# sql_script = f""" +# -- Attach the source and target databases +# ATTACH DATABASE '{source_db}' AS source; +# ATTACH DATABASE '{target_db}' AS target; + +# """ + +# # Dynamically format the cross-database command +# for data_column, method in data_columns: +# # ----- Format the function-method-suffic keys +# suffix = FUNCTION_MAP[method]["suffix"] +# fun = 
FUNCTION_MAP[method]["function"] +# # ---- Create the combined SQL command using f-strings +# sql_script += f""" +# -- Calculate averages and directly update the target table +# UPDATE target.{target_table} +# SET {data_column}_{suffix} = ( +# SELECT {fun}({data_column}) +# FROM source.{source_table} +# WHERE stratum = target.{target_table}.stratum +# ) +# WHERE stratum IN ({strata_str}); +# """ +# # ----- Append DETACH commands only once at the end +# sql_script += """ +# -- Detach the databases +# DETACH DATABASE source; +# DETACH DATABASE target; +# """ + +# # Create the engine +# engine = create_engine(f"sqlite:///{target_db}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = dbapi_conn.executescript(sql_script) + +# SQL(biology_db, "select", table_name=target_table) +# SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() +# connection.close() +# dbapi_conn.close() + + +# pairs = [(1, 2), (3, 4), (5, 6)] + +# # Convert the pairs into a format suitable for SQL IN clause +# pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) + +# # Construct the SQL command as a text string +# sql_command = f''' +# BEGIN TRANSACTION; + +# UPDATE reference +# SET total = ( +# SELECT AVG(g.sigma_bs) * r.area +# FROM grid g +# WHERE g.stratum = r.stratum_x +# ) +# WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); + +# COMMIT; +# ''' + +# psi = 10 ** (-21/10) +# psi * 280**2 * 1500 * 128e-6 / 2 +# psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] + +# psi * (280.0 ** 2) / 1852 ** 2 +# depth_area = 280 ** 2 * psi +# swath_length = 0.5 * 1852 +# depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] +# 280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] + +# SQL(acoustic_db, "map") +# beam_angle = 9.0 * np.pi / 180.0 +# 280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] +# 280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * +# nasc_biology["number_density"] +# area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) +# area / 1852 ** 2 * nasc_biology["number_density"] +# SQL(acoustic_db, "map") + +# # Merge hake fraction data into `nasc_interval_df` +# # ---- Initial merge +# nasc_interval_df = nasc_interval_df.merge( +# input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" +# ) +# # ---- Replace `fraction_hake` where NaN occurs +# nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) +# # ---- Drop NaN +# nasc_interval_df.dropna(subset=["transect_num"], inplace=True) + +# # Calculate the along-transect number density (animals per nmi^2) +# # ---- Merge NASC measurements with mean sigma_bs for each stratum +# nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) +# # ---- Calculate the number densities +# nasc_biology["number_density"] = ( +# nasc_biology["fraction_hake"] +# * nasc_biology["nasc"] +# / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +# ) + + +# if working_dataset == "acoustic": +# db_file = self.config["database"]["acoustic"] +# elif working_dataset == "biology": +# db_file = self.config["database"]["biology"] +# else: +# raise ValueError( +# f"Argument for `working_dataset` [{working_dataset}] is invalid." +# f" Value must either be 'acoustic' or 'biology'." 
+# ) + +# # Extract the necessary correct strata mean sigma_bs +# sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] + +# # Pull out the length-weight conversion for each stratum +# length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] + +# # Get the name of the stratum column +# stratum_col = settings_dict["transect"]["stratum_name"] + + +# catch_data = self.input["biology"]["catch_df"] + +# # Get the spatial column name, if there is one +# spatial_column = file_configuration["spatial_column"] +# # ---- Append additional columns that will be used +# contrast_columns = spatial_column + ["sex", "species_id"] + +# # Calculate grouped totals +# # ---- Sum the net haul weights from station 1/unaged fish +# catch_weights = catch_data.count_variable( +# contrasts=["species_id"] + spatial_column, +# variable="haul_weight", fun="sum" +# ) +# # ---- Rename resulting columns for both +# catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# # ---- Specimen +# specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + +# specimen_weight_binned +# # Calculate the sexed and total stratum weights for each sex among unaged fish +# # ---- Sum the net haul weights from station 1/unaged fish +# catch_weights = catch_data.count_variable( +# contrasts=["species_id"] + file_configuration["spatial_column"], +# variable="haul_weight", fun="sum" +# ) +# # ---- Rename resulting columns for both +# catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# # For the specimen data +# # ---- Sum the net haul weights from station 1/unaged fish +# # ---- Specimen +# specimen_weights_sex = ( +# specimen_weight_binned +# .groupby(contrast_columns)["weight"] +# .sum() +# ) +# # ---- Total (per stratum, if it exists) +# specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + +# # For the length (unaged) dataset +# length_weights_sex = ( +# length_weight_binned +# .groupby(contrast_columns)["weight_interp"] +# .sum() +# ) +# # ---- Further reduce to the grand total (per stratum, if it exists) +# length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + +# # ---- Standardize the unaged sexed weights +# length_weight_standardized = ( +# (length_weights_sex / length_weight_total).unstack(0) +# * catch_weights["total_weight"].to_numpy() +# ) + +# # Calculate the specimen weight proportions +# # ---- Pivot weight bins +# specimen_weight_binned_pvt = ( +# specimen_weight_binned.pivot_table( +# columns=spatial_column, +# index=["length_bin", "species_id", "sex"], +# values="weight", +# observed = False +# ) +# ) +# # ---- Divide by the aged stratum weights (relative to only aged fish) +# specimen_weight_proportions_pvt = ( +# specimen_weight_binned_pvt / specimen_weight_total.to_numpy() +# ) +# # ---- Pivot back to the desired format +# specimen_weight_proportion = ( +# specimen_weight_proportions_pvt +# .stack().reset_index(name="weight_proportion") +# .pivot_table(columns=stratum_column + ["species_id", "sex"], +# index="length_bin", values="weight_proportion") +# ) +# # ---- Calculate the internal (i.e. 
only aged fish) for each sex +# within_specimen_sex_proportions = ( +# specimen_weight_proportion.sum() +# ) + +# # Calculate the total strata weights +# # ---- Index `catch_weights` +# catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) +# # ---- Compute the spatially-stratified/grouped weights +# spatial_weights = ( +# pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) +# .pivot_table( +# columns=stratum_column, +# aggfunc="sum", +# values="total_weight", +# observed=False +# ) +# ) + +# # Calculate the weight proportions relative to the overall stratum weights +# # ---- Aged +# # -------- Reformat into dataframe and merge with total stratum weights +# specimen_weights_binned_df = ( +# specimen_weight_binned_pvt.stack() +# .to_frame("specimen_weight") +# .reset_index() +# .merge(spatial_weights.T.reset_index(), on=stratum_column) +# ) +# # -------- Calculate proportions +# specimen_weights_binned_df["weight_proportion_overall"] = ( +# specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] +# ) +# # -------- Consolidate to calculate the sexed proportions per stratum +# specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column +# + ["species_id", "sex"])[ +# "weight_proportion_overall" +# ].sum() +# # ---- Unaged +# # -------- Reformat into dataframe and merge with total stratum weights +# length_weights_sex_standardized_df = ( +# length_weight_standardized.stack() +# .to_frame("catch_weight") +# .reset_index() +# .merge(spatial_weights.T.reset_index(), on=stratum_column) +# ) +# # -------- Calculate proportions +# length_weights_sex_standardized_df["weight_proportion_overall"] = ( +# length_weights_sex_standardized_df["catch_weight"] +# / length_weights_sex_standardized_df["total_weight"] +# ) +# # -------- Back-calculate the sexed weight proportions relative to just unaged fish +# # ------------ Aggregate proportions +# length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( +# columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" +# ).transpose().unstack(["species_id"]).sum(axis=0) +# # ------------ Re-compute the proportions +# length_weight_sex_proportions = ( +# length_weights_sex_standardized_df.pivot_table( +# index=["species_id", "sex"], columns=stratum_column, +# values="weight_proportion_overall" +# ) +# / length_total_sex_proportions.to_numpy() +# ) + +# # Compute the overall length-binned weight distributions among unaged fish +# # ---- Extract the number proportions computed for unaged fish +# length_number_proportions = length_number_proportion.copy() +# # ---- Filter out values besides those computed for 'all' fish +# length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] +# # ---- Convert to a table +# length_number_proportions_tbl = length_number_proportions.pivot_table( +# columns=stratum_column + ["species_id"], +# index=["length_bin"], +# values="proportion_number_length", +# aggfunc="sum", +# observed=False, +# ) +# # ---- Extract the fitted weight values calculated for all fish +# length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] +# # ---- Generate the fitted weight array +# fitted_weights = length_weight_all.copy() +# # ---- Get actual length bins in dataset +# fitted_weights = fitted_weights[fitted_weights["length_bin"]. 
+# isin(length_number_proportions["length_bin"])] +# # ---- Apportion the averaged weights +# length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"] +# .to_numpy() +# # ---- Compute the average weight proportions per length bin per stratum +# average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights +# .sum(axis=1) +# # ---- Convert back to a DataFrame +# average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( +# name="weight_proportion" +# ) + +# # Calculate the aged and unaged weight proportions +# # ---- Aged +# aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) +# # ---- Unaged +# unaged_proportions = 1 - aged_proportions +# # -------- Re-weight the unaged sexed proportions +# unaged_weight_sex_proportions_overall = ( +# (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float). +# fillna(0.0) +# ) + +# unaged_proportions.unstack().transpose() +# # Format the outputs +# # ---- Aged: stratum-sex-age-length relative to aged and total weights +# aged_overall_df = ( +# specimen_weight_proportion.unstack() +# .reset_index(name="weight_proportions") +# .merge( +# specimen_weights_binned_df[ +# stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] +# ] +# ) +# ) +# # ---- Aged: stratum-sex relative to total weights +# aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( +# stratum_column + ["species_id", "sex"] +# ) +# # ---- Add the aged sex proportiosn relative to the overall survey +# aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions +# # ---- Consolidate the aged and unaged sexed dataframes +# # -------- Initialize the dataframe +# aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] +# + stratum_column) +# # --------- Add the within-unaged weight proportions +# aged_unaged_sex_proportions["weight_proportion_unaged"] = ( +# length_weight_sex_proportions.stack() +# ) +# # --------- Add the overall-unaged weight proportions +# aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( +# unaged_weight_sex_proportions_overall.stack() +# ) +# # ---- Overall aged and unaged proportions +# aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") +# # ---- Set index +# aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) +# # -------- Add unaged proportions +# aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() +# # ---- Reset the index +# aged_unaged_proportions = aged_unaged_proportions.reset_index() +# ################################################################################################## +# # * Functionality for reading in processed acoustic data +# # TODO: Expand data validator and limit cases to '*.zarr' (for now) +# # TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
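+# NOTE (editorial sketch, not part of the original patch): a compact, self-contained
+# illustration of the density conversions used repeatedly in this scratch script --
+# NASC is scaled to an areal number density by the stratum-mean backscattering
+# cross-section (sigma_bs), then to a biomass density via the stratum-average weight.
+# The numeric values below are placeholders; column names mirror the tables used above.
+# import numpy as np
+# import pandas as pd
+#
+# nasc_biology = pd.DataFrame({
+#     "stratum": [1, 2],
+#     "nasc": [1500.0, 800.0],            # NASC (m^2 nmi^-2)
+#     "sigma_bs_mean": [2.0e-5, 1.5e-5],  # mean sigma_bs (m^2)
+#     "average_weight": [0.45, 0.52],     # stratum-average weight (kg)
+# })
+# # Number density (animals nmi^-2): NASC / (4 * pi * sigma_bs)
+# nasc_biology["number_density"] = (
+#     nasc_biology["nasc"] / (4.0 * np.pi * nasc_biology["sigma_bs_mean"])
+# )
+# # Biomass density (kg nmi^-2): number density scaled by the average weight
+# nasc_biology["biomass_density"] = (
+#     nasc_biology["number_density"] * nasc_biology["average_weight"]
+# )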
+# # TODO: Documentation +# file_settings = file_configuration["input_directories"]["acoustics"] +# root_directory = file_configuration["data_root_dir"] + + +# ################################################################################################## +# def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, +# List[str]]] = None): + +# # Get all database files +# database_files = file_configuration["database"] + +# # Iterate through all keys +# for _, db_file in database_files.items(): +# # ---- Map the table names +# table_names = SQL(db_file, "map") +# # ---- Drop any noted exceptions +# if not isinstance(table_exception, list): +# table_exception = [table_exception] +# # ---- Drop exception table name +# if None not in table_exception: +# table_names = list(set(table_names) - set(table_exception)) +# _ = [SQL(db_file, "drop", table_name=table) for table in table_names] +# # ---- Validate that all tables were removed +# if set(table_names).intersection(set(SQL(table_names, "map"))): +# raise ValueError( +# f"Attempted reset of [{str(db_file)}] failed." +# ) + +# SPATIAL_CONFIG_MAP = { +# "closest_haul": { +# "proximity": { +# "choices": ["distance", "time"], +# }, +# }, +# "global" : {}, +# "griddify": { +# "bounds": { +# "longitude": { +# "types": [float] +# }, +# "latitude": { +# "types": [float] +# }, +# "northings": { +# "types": [float] +# }, +# "eastings": { +# "types": [float] +# }, +# "pairs": [("longitude", "latitude"), ("northings", "eastings")], +# }, +# "grid_resolution": { +# "x_distance": { +# "types": float, +# }, +# "y_distance": { +# "types": float, +# }, +# "d_longitude": { +# "types": float, +# }, +# "d_latitude": { +# "types": float, +# }, +# "grid_size_x": { +# "types": int, +# }, +# "grid_size_y": { +# "types": int, +# }, +# "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), +# ("grid_size_x", "grid_size_y")], +# }, +# }, +# "inpfc": { +# "stratum_names": { +# "types": [int, str] +# }, +# "latitude_max": { +# "types": [float], +# }, +# }, +# "weighted_haul": { +# "proximity": { +# "choices": ["distance", "time"] +# }, +# }, +# } + + +# reset_db_files(file_configuration, table_exception = "files_read") +# reset_db_files(file_configuration) + +# stamp = 20240714194248 +# stamp.astype(int) +# int(stamp) +# import re +# from datetime import datetime + + +# def infer_datetime_format(timestamp_str: Union[int, str]): +# patterns = { +# r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS +# r"^\d{8}$": "%Y%m%d", # YYYYMMDD +# r"^\d{6}$": "%H%M%S", # HHMMSS +# r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS +# r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS +# r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD +# r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD +# } + +# for pattern, date_format in patterns.items(): +# if re.match(pattern, timestamp_str): +# return date_format + +# raise ValueError("Unknown timestamp format") + +# filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) + +# def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + +# # Create dataframe copy +# data_copy = biology_data.copy() + +# # Iterate through dictionary to apply filters (if present) +# for column, value in filter_dict.items(): +# if column in data_copy.columns: +# data_copy = data_copy[data_copy[column] == value] + +# # Return output +# return data_copy + + +# df[(df['species_id'] == species_filter if 'species_id' in df.columns else 
True)] +# df[(df["species_id"] == 17 if "species_id" in df.columns)] + +# (df[df["haul_num"] == 17 if "haul_num" in df.columns] else True) + + +# from datetime import datetime + +# df = biology_output["trawl_info_df"] +# df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] +# df.index + +# biology_output["trawl_info_df"].reset_index().index +# df = biology_output["catch_df"] +# df = df.loc[0, :].to_frame().T +# df.index +# df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] + +# def convert_datetime(timestamp: Union[int, str, pd.Series]): + +# if isinstance(timestamp, pd.Series): +# test_timestamp = str(timestamp[0]) +# else: +# test_timestamp = str(timestamp) + +# # Approximate the datetime format +# datetime_format = infer_datetime_format(str(test_timestamp)) + +# # +# if isinstance(timestamp, pd.Series): +# return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) +# else: +# return datetime.strptime(timestamp, datetime_format) + +# infer_datetime_format(stamp) +# convert_datetime(stamp) +# infer_datetime_format(202407) + +# # {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} +# file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" +# file_configuration["geospatial"] +# spatial_config = file_configuration["geospatial"] +# ############### + +# acoustic_data = self.input["acoustics"] +# biology_data = self.input["biology"] + + +# from echopop.live.live_core import SPATIAL_CONFIG_MAP + + +# def load_spatial_data(acoustic_data: dict, +# biology_data: dict, +# file_configuration: dict,): + +# # Extract spatial strata *only* if spatial information from the configuration settings +# # ---- Get (geo)spatial config +# spatial_config = file_configuration["geospatial"] +# # ---- Remove case sensitivity +# spatial_config = {key.lower(): value for key, value in spatial_config.items()} +# # ---- Extract the projection +# projection = spatial_config["projection"] +# # ---- Extract the biology-acoustics linking method options +# acoustics_biology_link = spatial_config["link_biology_acoustics"] + +# # Validate the configuration +# validate_spatial_config(spatial_config) + +# # Create spatial dictionary that will be added as an `input` +# spatial_dict = {"link_method": acoustics_biology_link} + +# # Assign the spatial link constraints to the acoustic and biological data +# if acoustics_biology_link == "INPFC": +# spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + +# # Return the dictionary as an output +# return spatial_dict + + +# # Convert the DataFrame to a GeoDataFrame +# acoustic_data_gdf = gpd.GeoDataFrame( +# data=acoustic_data, +# geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), +# crs=projection +# ) + +# # Validate the spatial biology-acoustics linking method +# # ---- Get the biology-acoustics linking method +# link_method = next(key for key, value in acoustics_biology_link.items() if value) +# # ---- Flag Error if unexpected method +# if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: +# raise ValueError( +# f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " +# f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
+# ) + +# ################################################################################################## +# # TEST: BIOLOGY FILE INGESTION CONFIGURATION +# # NOTE: +# # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +# biology_data, file_configuration = load_biology_data(file_configuration) +# biology_data +# ################################################################################################## +# prc_nasc_df = acoustic_data["prc_nasc_df"] + +# def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, +# echometrics: bool = True): + +# # Integrate NASC (and compute the echometrics, if necessary) +# nasc_data_df = ( +# acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) +# .apply(lambda group: integrate_nasc(group, echometrics)) +# .reset_index() +# ) +# # ---- Amend the dtypes if echometrics were computed +# if echometrics: +# nasc_data_df = ( +# nasc_data_df +# .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, +# "center_of_mass": float, "dispersion": float, "evenness": float, +# "aggregation": float, "occupied_area": float}) +# ) + +# # Get the name of the associated db file +# acoustics_db = file_configuration["database"]["acoustics"] +# # ---- Get current tables +# tables = SQL(acoustics_db, "inspect") + +# # +# if "nasc_df" not in tables: +# _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) # else: -# # If intersection is a single Polygon, directly add to clipped geometries -# clipped_geometries.append(intersection) +# # ---- +# nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") +# # ---- +# index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) +# # ---- +# bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) +# # ---- +# _ = SQL(acoustics_db, "insert", table_name="nasc_df", +# dataframe=nasc_data_df.loc[~bool_idx]) +# # ---- +# nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) + +# # Return the output +# return nasc_data_df + + +# SQL(acoustics_db, command="drop", table_name="nasc_df") +# SQL(acoustics_db, "inspect") + +# nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) + +# SQL(acoustics_db, command="select", table_name="nasc_df") + +# TS_SLOPE = 20.0 +# TS_INTERCEPT = -68.0 + +# # CONVERT TO TS +# comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# # TO SIGMA_BS +# comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# # WEIGHTED MEAN SIGMA_BS +# sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +# from typing import Optional + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.utils import operations + +# __all__ = ["operations"] + +# # Meld bio datasets +# length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], +# contrasts=["haul_num", "sex", +# "species_id", "length"]) + +# # Create distribution +# distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + +# length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], +# distrib_params)}, dtype=float) +# binwidth = np.diff(length_bins / 2.0).mean() +# intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) +# length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) +# # +# length_datasets["length_bin"] = 
pd.cut(length_datasets["length"], bins=intervals, +# labels=length_bins_df["bin"]) + +# stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] + +# if stratify_key == "global": +# length_distribution = ( +# length_datasets.pivot_table(columns=["sex"], index=["length_bin"], +# values="length_count", aggfunc="sum", observed=False) +# ) +# # +# length_distribution["total"] = length_distribution.sum(axis=1) + +# length_distribution.transpose() +# SQL(biology_db, "drop", table_name="length_distribution") +# # Get the name of the associated db file +# biology_db = file_configuration["database"]["biology"] +# # ---- Get current tables +# tables = SQL(biology_db, "inspect") + + +# if "length_distribution" not in tables: +# _ = SQL(biology_db, "insert", table_name="length_distribution", +# dataframe=length_distribution.transpose()) + + +# SQL(biology_db, "select", table_name="length_distribution") +# SQL(biology_db, "drop", table_name="length_distribution") +# SQL(biology_db, "replace", table_name="length_distribution", +# dataframe=length_distribution.unstack().reset_index(name="count")) +# length_distribution.unstack().reset_index(name="count") +# mixed = SQL(biology_db, "select", table_name="length_distribution") +# length_bins[:1] +# from typing import Optional + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.utils import operations + +# __all__ = ["operations"] + +# biology_data = self.input["biology"] + +# # Meld bio datasets +# length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], +# contrasts=["haul_num", "species_id", "length"]) + +# ts_length_parameters_spp = [ +# spp +# for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() +# if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) +# ] + +# # ---- get species info +# target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + +# ts_lengths_df = length_datasets.merge( +# target_species.drop("length_units", axis=1), +# left_on=["species_id"], +# right_on=["number_code"], +# ) +# # ---- filter out other spp +# length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + +# # +# file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] + +# def average_sigma_bs(length: Union[pd.DataFrame, float, int], +# TS_L_slope: Optional[float] = None, +# TS_L_intercept: Optional[float] = None, +# weighted: Optional[Union[float, int, str]] = None): + +# # +# if isinstance(length, pd.DataFrame): +# if "length" not in length.columns: +# raise ValueError( +# "Column [`length`] missing from dataframe input `length`." +# ) +# if "TS_L_slope" not in length.columns and TS_L_slope is None: +# raise ValueError( +# "Value [`TS_L_slope`] missing from dataframe input `length` and optional " +# "separate argument `TS_L_slope`." +# ) +# if "TS_L_intercept" not in length.columns and TS_L_intercept is None: +# raise ValueError( +# "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " +# "separate argument `TS_L_intercept`." +# ) +# elif isinstance(length, float) or isinstance(length, int): +# if TS_L_slope is None: +# raise ValueError( +# "Argument [`TS_L_slope`] missing." +# ) +# elif TS_L_slope is not None and not isinstance(TS_L_slope, float): +# raise TypeError( +# "Argument `TS_L_slope` must be type `float`." 
+# ) +# if "TS_L_intercept" not in length.columns and TS_L_intercept is None: +# raise ValueError( +# "Argument [`TS_L_intercept`] missing." +# ) +# elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): +# raise TypeError( +# "Argument `TS_L_intercept` must be type `float`." +# ) + +# # +# if TS_L_slope is None: +# TS_L_slope = length["TS_L_slope"] + +# # +# if TS_L_intercept is None: +# TS_L_intercept = length["TS_L_intercept"] + +# # +# if isinstance(length, pd.DataFrame): +# length_val = length["length"] + +# ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) +# sigma_bs_value = to_linear(ts_value) + + +# if isinstance(weighted, str): +# if weighted not in length.columns: +# raise ValueError( +# f"Argument [`weighted` (str)], '{weighted}', is not a column in argument +# `length` " +# f"(DataFrame)." +# ) +# else: +# return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() +# elif weighted is not None: +# if weighted.size != sigma_bs_value.size: +# raise ValueError( +# f"Argument [`weighted` (float|int)] of size {weighted.size} does not +# match size of " +# f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." +# ) +# else: +# return (sigma_bs_value * weighted).sum() / weighted.sum() +# else: +# return sigma_bs_value.mean() + +# def parse_condition(condition): +# # Handle nested conditions and logical operators +# condition = condition.replace('&', ' AND ').replace('|', ' OR ') + +# # Handle "IN" lists and replace square brackets with parentheses +# condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", +# condition, flags=re.IGNORECASE) + +# # Handle range conditions for BETWEEN, including floats +# condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', +# lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + +# # Handle individual comparisons +# condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} +# {m.group(2)} {m.group(3)}", condition) +# condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} +# {m.group(2)} {m.group(3)}", condition) + +# # Handle single equal sign +# condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", +# condition) + +# # Remove redundant spaces +# condition = re.sub(r'\s+', ' ', condition).strip() + +# return condition + +# ################################################################################################## +# def load_spatial_data(file_configuration: dict, +# acoustic_data: pd.DataFrame, +# coordinate_metadata: xr.Dataset): + +# # Extract spatial strata *only* if spatial information from the configuration settings +# # ---- Extract the projection +# projection = file_configuration["geospatial"]["projection"] +# # ---- Extract the biology-acoustics linking method options +# acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] + +# # Convert the DataFrame to a GeoDataFrame +# acoustic_data_gdf = gpd.GeoDataFrame( +# data=acoustic_data, +# geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), +# crs=projection +# ) + +# # Validate the spatial biology-acoustics linking method +# # ---- Get the biology-acoustics linking method +# link_method = next(key for key, value in acoustics_biology_link.items() if value) +# # ---- Flag Error if unexpected method +# if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: +# raise ValueError( +# 
f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " +# f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." +# ) + +# # Create INPFC stratum dataframe +# # ---- Extract + +# # Validate projection information +# # ---- Create a dummy GeoDataFrame to extract CRS information +# # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) +# # ---- Extract coordinate limits from the acoustic data +# # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] +# # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] +# # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] +# # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] +# # # ---- Create boundary box string +# # boundary_box_str = ( +# # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " +# # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" +# # ) + +# # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy( +# acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{ +# utm_string_generator(lon_min, lat_min)}") +# # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], +# acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") + +# # from pyproj import CRS +# # from pyproj.aoi import AreaOfInterest +# # from pyproj.database import query_utm_crs_info + +# # utm_crs_list = query_utm_crs_info( +# # datum_name="WGS 84", +# # area_of_interest=AreaOfInterest( +# # west_lon_degree=lon_min, +# # south_lat_degree=lat_min, +# # east_lon_degree=-lon_max, +# # north_lat_degree=lat_max, +# # ), +# # ) +# # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") + +# ################################################################################################## +# def live_data(file_configuration: dict): + +# # Extract the file directories (or from the configuration) containing acoustic, biological,and +# # spatial definitions/data/parameters +# # ---- Acoustic data +# acoustic_data = load_validated_acoustic_data(file_configuration) +# # ---- Biological data +# # ---- Spatial data + + +# ################################################################################################## +# # * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) +# # TODO: Update structure with additional information (as needed) +# # TODO: Documentation +# LIVE_DATA_STRUCTURE = { +# "meta": { +# "provenance": dict(), +# "date": list(), +# }, +# "input": { +# "acoustics": { +# "nasc_df": pd.DataFrame(), +# }, +# "biology": { +# "catch_df": pd.DataFrame(), +# "distributions": { +# "length_bins_df": pd.DataFrame(), +# }, +# "length_df": pd.DataFrame(), +# "specimen_df": pd.DataFrame(), +# }, +# }, +# "results": { +# "acoustics": dict(), +# "biology": dict(), +# "stratified": dict(), +# }, +# } +# ################################################################################################## +# # * Define `LiveSurvey` class structure +# # TODO: Incorporate validators +# # TODO: Scope out full structure including accessors, attributes, and methods +# # TODO: Configure input arguments (for initialization) +# # TODO: Documentation +# class LiveSurvey: +# """ +# A real-time processing version of the `echopop` base `Survey` class that ingests biological, +# acoustic, and event meta data to provide population estimates when generated. 
+# """ + +# def __init__( +# self, +# live_init_config_path: Union[str, Path], +# live_file_config_path: Union[str, Path], +# ): +# # Initialize `meta` attribute +# self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + +# # Loading the configuration settings and definitions that are used for defining the +# # configuration settings +# self.config = live_configuration(live_file_config_path, live_file_config_path) + +# # Loading the datasets defined in the configuration files +# self.input = el.load_survey_data(self.config) + +# # Initialize the `results` data attribute +# self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + +# current_units = zarr_data_ds["frequency_nominal"].units +# acoustic_analysis_settings["transmit"] +# file_configuration + +# specimen_df = pd.DataFrame( +# { +# "haul_num": np.repeat([1,2,3], 4), +# "station": "specimen", +# "sex": np.tile(["male", "female"], 6), +# "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), +# "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, +# }, +# ) + +# length_df = pd.DataFrame( +# { +# "haul_num": np.repeat([1,2,3], 4), +# "station": "length", +# "sex": np.tile(["male", "female"], 6), +# "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), +# "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), +# }, +# ) + +# catch_df = pd.DataFrame( +# { +# "haul_num": np.array([1, 2, 3]), +# "weight": np.array([503.12, 684.32, 978.54]) +# } +# ) + +# TS_SLOPE = 20.0 +# TS_INTERCEPT = -68.0 + +# acoustic_db = realtime_survey.config["database"]["acoustics"] +# SQL(acoustic_db, "select", table_name="files_processed") +# biology_db = realtime_survey.config["database"]["biology"] +# SQL(biology_db, "select", table_name="files_processedk") +# #### +# # CONCATENATE FILE SOURCES +# specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].val +# ue_counts().to_frame("length_count").reset_index() +# specimen_reframed +# # MELD +# all_lengths = pd.concat([length_df, specimen_reframed]) +# # COMBINE +# comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_fra +# me("length_count").reset_index() + + +# from echopop.live.sql_methods import SQL + +# # Assuming that you have a LiveSurvey object defined +# # ---- Get the database file name (and path) +# biology_db = livesurvey_object.config["database"]["biology"] +# # ---- +# # CONVERT TO TS +# comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# # TO SIGMA_BS +# comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# # WEIGHTED MEAN SIGMA_BS +# sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +# # INTEGRATE NASC +# path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" + +# Path(path2file).exists() +# xds = xr.open_dataset(path2file, engine="zarr") +# xds +# xdf = xds.to_dataframe().reset_index() +# xdf["NASC"] = xdf["NASC"].fillna(0.0) +# # convert frequency +# xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) +# # filter +# xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] + +# xdf_38.plot.scatter(x="distance", y="depth", c="NASC") +# plt.show() + +# xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() + +# plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) +# plt.plot(xdf_int["longitude"], xdf_int["latitude"]) +# plt.show() + +# # CONVERT TO NUMBER DENSITY 
+# xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) + + +# import geopandas as gpd +# import pyproj + +# ################### +# from geopy.distance import distance +# from shapely.geometry import Point, Polygon, box +# from shapely.ops import unary_union + +# grid_settings = file_configuration["geospatial"]["griddify"] +# grid = [] +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +# lat_min = grid_settings["bounds"]["latitude"][0] +# lat_max = grid_settings["bounds"]["latitude"][1] +# lon_min = grid_settings["bounds"]["longitude"][0] +# lon_max = grid_settings["bounds"]["longitude"][1] + +# utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +# utm_proj = pyproj.Proj(f"epsg:{utm_str}") +# x_min, y_min = utm_proj(lon_min, lat_min) +# x_max, y_max = utm_proj(lon_max, lat_max) + +# lat = 55.5000 +# lon = -134.2500 +# utm_code = int(utm_string_generator(lon, lat)) +# utm_proj = pyproj.Proj(f"epsg:{utm_code}") +# utm_proj(lon, lat) +# gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=project +# ion).to_crs(utm_code) + + +# num_lon_steps = int((x_max - x_min) / lon_step) +# num_lat_steps = int((y_max - y_min) / lat_step) + +# lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) +# lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) +# lon2 = lon1 + lon_step +# lat2 = lat1 + lat_step + +# # Convert UTM coordinates back to degrees +# lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) +# lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) + +# # Convert UTM coordinates back to degrees with adjusted resolution +# lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) +# lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) + + +# polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lo +# n2_deg, lat2_deg)] +# grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") + + +# world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ +# ne_110m_land/ne_110m_land.shp") +# bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) +# shapefile = world +# clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) +# clipped_shapefile.to_crs(utm_proj.srs) +# # clipped_geometry = bbox.intersection(world.union_all()) +# # clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) + +# from shapely.geometry import MultiPolygon + +# # Create an empty list to store clipped geometries +# # clipped_geometries = [] + +# # # Iterate over each grid polygon +# # for index, row in grid_gdf.iterrows(): +# # # Intersect grid polygon with land shape +# # intersection = row['geometry'].intersection(clipped_shapefile.unary_union) + +# # # If intersection is a MultiPolygon, get the difference with the land shape +# # if isinstance(intersection, MultiPolygon): +# # clipped = row['geometry'].difference(clipped_shapefile.unary_union) +# # if clipped.is_empty: +# # continue +# # clipped_geometries.append(clipped) +# # else: +# # # If intersection is a single Polygon, directly add to clipped geometries +# # clipped_geometries.append(intersection) -# clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) +# # clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) -clipped_geometries = 
grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile.geometry.union_all()) -clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) -clipped_gdf.to_crs(epsg=32610) +# clipped_geometries = grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile +# .geometry.union_all()) +# clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) +# clipped_gdf.to_crs(epsg=32610) -invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] -clipped_gdf = clipped_gdf.buffer(0.001) -clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 +# invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] +# clipped_gdf = clipped_gdf.buffer(0.001) +# clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 -clipped_gdf.area +# clipped_gdf.area -fig, ax = plt.subplots(figsize=(10, 8)) -clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) -plt.tight_layout() -plt.show() +# fig, ax = plt.subplots(figsize=(10, 8)) +# clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) +# plt.tight_layout() +# plt.show() -bbox.crs = {"init": "epsg:4326"} -intersection = gpd.overlay(bbox, world, how='intersection') +# bbox.crs = {"init": "epsg:4326"} +# intersection = gpd.overlay(bbox, world, how='intersection') -world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') +# world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') -world_cut = world[world.geometry.intersects(bbox)] -world_cut.to_crs("epsg:4326") +# world_cut = world[world.geometry.intersects(bbox)] +# world_cut.to_crs("epsg:4326") -import matplotlib.pyplot as plt -fig, ax = plt.subplots(figsize=(10, 10)) -grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -world_cut.plot(ax=ax, linewidth=2, color='blue') -plt.show() +# import matplotlib.pyplot as plt -for cell in grid_gdf: +# fig, ax = plt.subplots(figsize=(10, 10)) +# grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# world_cut.plot(ax=ax, linewidth=2, color='blue') +# plt.show() - x, y = cell.exterior.xy # Extract x and y coordinates of the cell - ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch -# Plot coastline -# world.plot(ax=ax, linewidth=2, color='blue') -plt.show() +# for cell in grid_gdf: +# x, y = cell.exterior.xy # Extract x and y coordinates of the cell +# ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch +# # Plot coastline +# # world.plot(ax=ax, linewidth=2, color='blue') +# plt.show() -bbox = (lat_min, lon_min, lat_max, lon_max) -G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) -G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], tags={'natural': ['coastline']}) +# bbox = (lat_min, lon_min, lat_max, lon_max) +# G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) +# G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], +# tags={'natural': ['coastline']}) -latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) -longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) +# latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) +# longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) -# Initialize `meta` attribute -meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) 
+# # Initialize `meta` attribute +# meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) -# Loading the configuration settings and definitions that are used to -# initialize the Survey class object -config = yaml.safe_load(Path(initialization_config).read_text()) +# # Loading the configuration settings and definitions that are used to +# # initialize the Survey class object +# config = yaml.safe_load(Path(initialization_config).read_text()) -nasc_frequency = config["acoustics"]["nasc_frequency"] \ No newline at end of file +# nasc_frequency = config["acoustics"]["nasc_frequency"] From 218df8aac98409fd90e6e89816bbe86b58c1e8eb Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 28 Aug 2024 09:32:16 -0700 Subject: [PATCH 77/81] Pruned `test_workflow.py` --- echopop/test_workflow.py | 556 ++++++++++++--------------------------- 1 file changed, 163 insertions(+), 393 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 7c462db8..f85abd4b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,393 +1,163 @@ -# from echopop.live.live_survey import LiveSurvey -# from echopop.live.sql_methods import SQL -# import echopop.live.live_visualizer as elv -# from pathlib import Path -# from echopop.live import live_data_processing as eldp -# from echopop.live import live_data_loading as eldl -# from echopop.live.live_core import( -# LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -# ) -# import boto3 -# from botocore.exceptions import NoCredentialsError, ClientError -# import pandas as pd -# import numpy as np -# from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, -# sql_group_update, query_processed_files, sql_update_strata_summary -# from echopop.live.live_spatial_methods import apply_spatial_definitions -# from echopop.live.live_acoustics import average_sigma_bs, compute_nasc -# from echopop.live.live_biology import compute_sigma_bs -# from echopop.acoustics import ts_length_regression, to_dB, to_linear -# from echopop.utils.operations import group_interpolator_creator -# from functools import reduce -# from echopop.live.live_data_loading import filter_filenames, read_biology_csv - -# ################################################################################################## -# # TEST: Set up `LiveSurvey` object -# # NOTE: General initialization parameter configuration -# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initializat -# ion_config.yml" -# # NOTE: File configuration -# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_yea -# r_2019_config.yml" -# # NOTE: Create object -# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) - -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# self = realtime_survey -# file_configuration = self.config - -# input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", -# "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] -# realtime_survey.config["input_directories"]["biology"]["directory"] = -# "s3://sh2407-upload/data/Echopop-biology" - -# survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", -# table_name="survey_data_df") - - -# del realtime_survey.config["data_root_dir"] -# self = realtime_survey - -# # 
realtime_survey.config["storage_options"] = aws_credentials -# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# realtime_survey.load_biology_data(input_filenames=input_filenames) -# realtime_survey.input["biology"] -# def is_s3_path(path): -# """Check if a path is an S3 path.""" -# return path.startswith("s3://") - -# dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] -# s3_path = dataset_directory -# is_s3_path(dataset_directory) - -# cloud_credentials = aws_credentials -# cloud_credentials = {} -# def validate_s3_path(s3_path: str, cloud_credentials: dict): -# """Check if (parts of) S3 path exists.""" - -# # Redundant validation that S3 object validation is appropriate -# if not is_s3_path(s3_path): -# raise ValueError("The path is not an S3 path.") - -# # Validate credentials -# if not all([True if param in cloud_credentials.keys() else False -# for param in ["key", "secret"]]): -# # ---- Find missing credentials -# missing_creds = set(["key", "secret"]) - set(cloud_credentials) -# # ---- Format into string -# missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in -# missing_creds]) -# # ---- Raise Error -# raise PermissionError( -# f"Required S3 credentials missing: {missing_creds_str}." -# ) - -# # Remove the s3:// prefix -# s3_path_reduced = s3_path[len("s3://"):] - -# # Split into bucket and key -# parts = s3_path_reduced.split("/", 1) -# if len(parts) < 2: -# raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - -# # Get bucket name and directory keys -# bucket_name, directory = parts - -# # Initialize the S3 client -# s3_client = boto3.client("s3", -# aws_access_key_id=cloud_credentials["key"], -# aws_secret_access_key=cloud_credentials["secret"]) - -# # Check if the bucket exists -# try: -# s3_client.head_bucket(Bucket=bucket_name) -# except ClientError as e: -# raise FileNotFoundError( -# f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
-# ) - -# # Check if the S3 directory exists -# try: -# # ---- Ping a response from the bucket -# response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) -# # ---- Check for `Contents` -# if "Contents" not in response: -# raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") -# except ClientError as e: -# # --- Raise Error and propagate it upwards -# raise e - -# validate_s3_path(s3_path, cloud_credentials) - -# import pandas as pd - -# self = realtime_survey -# biology_files = self.meta["provenance"]["biology_files_read"] -# file_configuration = self.config -# dataset = "biology" - -# # Get the dataset file settings -# file_settings = file_configuration["input_directories"][dataset] - -# def construct_directorypath(file_configuration: dict, file_settings: dict): -# """Construct the root directory path.""" - -# # Get the general root_directory, if present -# if "data_root_dir" in file_configuration: -# root_directory = file_configuration["data_root_dir"] -# else: -# root_directory = "" - -# # Get the local directory (or this may be the root directory depending on the config) -# data_directory = file_settings["directory"] - -# # Return the directory path -# if root_directory != "": -# return "/".join([root_directory, data_directory]) -# else: -# return data_directory - -# directory_path = construct_directorypath(file_configuration, file_settings) - -# def validate_local_path(directory_path: str): - -# # Validate filepath -# # ---- Error evaluation (if applicable) -# if not Path(directory_path).exists(): -# raise FileNotFoundError( -# f"The acoustic data directory [{directory_path}] does not exist." -# ) - -# # Validate that files even exist -# # ---- List available files of target extension -# data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) -# # ---- Error evaluation (if applicable) -# if not data_files: -# raise FileNotFoundError( -# f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" -# ) - - -# # Get the biology data file settings -# file_settings = file_configuration["input_directories"]["biology"] - -# # Get the file-specific settings, datatypes, columns, etc. 
-# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# # ---- Extract the expected file name ID's -# biology_file_ids = file_settings["file_name_formats"] -# # ---- Extract all of the file ids -# biology_config_ids = list(biology_file_ids.keys()) -# # ---- Initialize the dictionary that will define this key in the `input` attribute -# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - - -# # Initialize a session with AWS credentials -# s3_client = boto3.client( -# 's3', -# aws_access_key_id=aws_credentials["key"], -# aws_secret_access_key=aws_credentials["secret"] -# ) -# response = s3_client.list_buckets() -# buckets = response.get('Buckets', []) -# for bucket in buckets: -# print(f"Bucket Name: {bucket['Name']}") -# s3_client.head_bucket(Bucket="sh2407-upload") -# realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) -# realtime_survey.config["ship_id"] -# grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") -# grid_data[grid_data.abundance > 0] -# bucket = boto3.client("s3", region_name=None) -# bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] -# +"/") -# bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], -# Prefix=path, MaxKeys=1) -# ################################################################################################# -# # TEST: TRIGGER --> NEW ACOUSTIC DATA -# # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) -# realtime_survey.load_acoustic_data() -# # NOTE: Process new acoustic data -# # NOTE: This will update linked database tables -# realtime_survey.process_acoustic_data() -# # NOTE: Generate population estimates (or pass if there are no biological data) -# # NOTE: `working_dataset = Literal["acoustic", "biology"]` -# realtime_survey.estimate_population(working_dataset="acoustic") -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# realtime_survey.input["acoustics"] -# ################################################################################################## -# # TEST: TRIGGER --> NEW BIOLOGY DATA -# # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` -# realtime_survey.load_biology_data() -# len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) -# realtime_survey.meta["provenance"]["biology_files_checkpoint3"] -# # NOTE: Process new biological data -# # NOTE: This will update linked database tables -# realtime_survey.process_biology_data() -# # NOTE: Generate population estimates (or pass if there are no acoustic data) -# # NOTE: `working_dataset = Literal["acoustic", "biology"]` -# realtime_survey.estimate_population(working_dataset="biology") -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# realtime_survey -# ################################################################################################## -# # TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` -# # NOTE: `LiveSurvey.meta` attribute -# # ---- ACOUSTIC -# realtime_survey.meta["provenance"]["acoustic_files"] -# # ---- BIOLOGICAL -# realtime_survey.meta["provenance"]["biology_files"] -# # NOTE: SQL function query from 
database file [cumulative list] -# # ---- ACOUSTIC -# SQL(db_file=realtime_survey.config["database"]["acoustics"], -# command="select", table_name="files_processed") -# dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", -# table_name="files_processed") -# # ---- BIOLOGICAL -# SQL(db_file=realtime_survey.config["database"]["biology"],command="select", -# table_name="files_processed") -# dat.loc[0:, "filepath"][105] -# ################################################################################################## -# # TEST: `LiveSurvey` --[(key) SQL tables]--> Users -# # !!! The SQL functions will fail if the tables have not yet been created/initialized -# # ---- ACOUSTICS -# # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") -# .latitude.max() -# realtime_survey.input["spatial"]["strata"] -# # NOTE: Along-track acoustically-derived number/biomass densities and NASC -# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# # ---- BIOLOGICAL -# # NOTE: Fitted (discretized) length-weight relationship -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -# # NOTE: Quantized length-binned weights (summed) -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -# # NOTE: Average weights per stratum -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -# # NOTE: Stratum summary tables -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -# ################################################################################################## -# # FROM THE `LiveSurvey` object ! 
-# # ---- Convert to a Panel -# import panel as pn -# # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table -# survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -# # grid_db = Path(realtime_survey.config["database"]["grid"]) -# grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") -# dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# dat -# dat1 = SQL(grid_db, "select", table_name="grid_df") -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") - -# sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" -# # Create the engine -# engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") -# # Create the SQL database connection and send the script -# with engine.connect() as connection: -# table = connection.execute(text(sql_cmd)) - -# data = table.fetchall() -# dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] -# dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] -# dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) -# SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", -# dataframe=dd) -# SQL("C:/Users/Brandyn/Downloads/biology.db", "map") -# SQL(biology_db, "drop", table_name="sigma_bs_mean_df") -# SQL(biology_db, "select", table_name="sigma_bs_mean_df") -# dd.loc[:, "haul_num"] = pd.Series([101, 103]) -# dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] -# SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) -# SQL(biology_db, "select", table_name="sigma_bs_mean_df") -# import numpy as np; import pandas as pd -# SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") -# sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", -# table_name="sigma_bs_mean_df") -# table_df = SQL(realtime_survey.config["database"]["biology"], "select", -# table_name="sigma_bs_mean_df") -# sigma_bs_df = table_df -# # ---- Check the table keys -# table_keys = np.unique(table_df["id"]).tolist() -# # ---- Get unique values -# current_keys = np.unique(sigma_bs_df["id"]).tolist() -# # ---- Get INSERTION keys -# insertion_keys = list(set(current_keys).difference(set(table_keys))) -# # ---- Get UPDATE keys -# update_keys = list(set(current_keys).intersection(set(table_keys))) -# insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] -# insertion_df.loc[0, "species_id"] = 22500 -# insertion_df.loc[0, "stratum"] = 5 -# insertion_df.loc[0, "haul_num"] = 100 -# insertion_df.loc[0, "sigma_bs"] = 1e-10 -# insertion_df.loc[0, "sigma_bs_count"] = 100 -# insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 -# insertion_df.loc[0, "id"] = f"{(1,1,1)}" -# SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", -# dataframe=insertion_df) -# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -# survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", -# table_name="survey_data_df") -# dat1[dat1.abundance > 0] -# dat[dat.number_density > 0] -# coast_db = grid_db -# biology_db = Path(realtime_survey.config["database"]["biology"]) -# projection = realtime_survey.config["geospatial"]["projection"] -# # NOTE: PLOTS -# # Ensure Panel is initialized -# pn.extension() -# # ---- Helper function -# def 
plt_to_pn(fig): -# # Convert to a panel object -# panel = pn.panel(fig) -# # Display -# panel.show() # OR panel.servable() if you want to serve it in a Panel server -# # ---- PLOT GRID -# fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) -# fig.show() -# plt_to_pn(fig) -# # ---- PLOT TRACK -# from echopop.live.live_visualizer import plot_livesurvey_track -# fig1 = plot_livesurvey_track(survey_data, projection, coast_db) -# fig1.show() -# plt_to_pn(fig1) -# # ---- PLOT DISTRIBUTIONS -# weight_table = SQL(biology_db, "select", -# table_name="length_weight_df") -# stratum_table = SQL(biology_db, "select", -# table_name="strata_summary_df") -# specimen_table = SQL(biology_db, "select", -# table_name="specimen_data_df") -# length_table = SQL(biology_db, "select", -# table_name="length_df") -# fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, -# length_table) -# plt_to_pn(fig2) -# ### MULTIPANEL -# panel0 = pn.panel(fig, name='Gridded population estimates') -# panel1 = pn.panel(fig1, name='Alongtrack population estimates') -# panel2 = pn.panel(fig2, name='Length and weight distributions') - -# def serve_panels(): -# # Create links to each panel -# home = pn.Column( -# pn.pane.Markdown("# Main Page"), -# pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", -# sizing_mode="stretch_width"), -# pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", -# sizing_mode="stretch_width"), -# pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", -# sizing_mode="stretch_width") -# ) - -# # Serve the home page and individual panels -# pn.serve({ -# 'Main Page': home, -# 'gridded_population_estimates': panel0, -# 'alongtrack_population_estimates': panel1, -# 'length_weight_distributions': panel2 -# }, show=True) -# # Run the function to serve panels -# serve_panels() +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import SQL +import echopop.live.live_visualizer as elv +from pathlib import Path +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +) +import boto3 +from botocore.exceptions import NoCredentialsError, ClientError +import pandas as pd +import numpy as np +from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +sql_group_update, query_processed_files, sql_update_strata_summary +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +from echopop.live.live_biology import compute_sigma_bs +from echopop.acoustics import ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce +from echopop.live.live_data_loading import filter_filenames, read_biology_csv + +################################################################################################## +# TEST: Set up `LiveSurvey` object +# NOTE: General initialization parameter configuration +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +# NOTE: File configuration +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# NOTE: Create object +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) + +# 
NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey +################################################################################################# +# TEST: TRIGGER --> NEW ACOUSTIC DATA +# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) +realtime_survey.load_acoustic_data() +# NOTE: Process new acoustic data +# NOTE: This will update linked database tables +realtime_survey.process_acoustic_data() +# NOTE: Generate population estimates (or pass if there are no biological data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` +realtime_survey.estimate_population(working_dataset="acoustic") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey.input["acoustics"] +################################################################################################## +# TEST: TRIGGER --> NEW BIOLOGY DATA +# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` +realtime_survey.load_biology_data() +# NOTE: Process new biological data +# NOTE: This will update linked database tables +realtime_survey.process_biology_data() +# NOTE: Generate population estimates (or pass if there are no acoustic data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` +realtime_survey.estimate_population(working_dataset="biology") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey +################################################################################################## +# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# NOTE: `LiveSurvey.meta` attribute +# ---- ACOUSTIC +realtime_survey.meta["provenance"]["acoustic_files"] +# ---- BIOLOGICAL +realtime_survey.meta["provenance"]["biology_files"] +# NOTE: SQL function query from database file [cumulative list] +# ---- ACOUSTIC +SQL(db_file=realtime_survey.config["database"]["acoustics"], + command="select", table_name="files_processed") +dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", +table_name="files_processed") +# ---- BIOLOGICAL +SQL(db_file=realtime_survey.config["database"]["biology"],command="select", +table_name="files_processed") +################################################################################################## +# TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# !!! 
The SQL functions will fail if the tables have not yet been created/initialized +# ---- ACOUSTICS +# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") +.latitude.max() +realtime_survey.input["spatial"]["strata"] +# NOTE: Along-track acoustically-derived number/biomass densities and NASC +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# ---- BIOLOGICAL +# NOTE: Fitted (discretized) length-weight relationship +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +# NOTE: Quantized length-binned weights (summed) +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Average weights per stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# NOTE: Stratum summary tables +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +################################################################################################## +# FROM THE `LiveSurvey` object ! +# ---- Convert to a Panel +import panel as pn +# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) +grid_db = Path(realtime_survey.config["database"]["grid"]) +coast_db = grid_db +biology_db = Path(realtime_survey.config["database"]["biology"]) +projection = realtime_survey.config["geospatial"]["projection"] +# NOTE: PLOTS +# Ensure Panel is initialized +pn.extension() +# ---- Helper function +def plt_to_pn(fig): + # Convert to a panel object + panel = pn.panel(fig) + # Display + panel.show() # OR panel.servable() if you want to serve it in a Panel server +# ---- PLOT GRID +fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig.show() +plt_to_pn(fig) +# ---- PLOT TRACK +from echopop.live.live_visualizer import plot_livesurvey_track +fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +fig1.show() +plt_to_pn(fig1) +# ---- PLOT DISTRIBUTIONS +weight_table = SQL(biology_db, "select", + table_name="length_weight_df") +stratum_table = SQL(biology_db, "select", + table_name="strata_summary_df") +specimen_table = SQL(biology_db, "select", + table_name="specimen_data_df") +length_table = SQL(biology_db, "select", + table_name="length_df") +fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, +length_table) +plt_to_pn(fig2) +### MULTIPANEL +panel0 = pn.panel(fig, name='Gridded population estimates') +panel1 = pn.panel(fig1, name='Alongtrack population estimates') +panel2 = pn.panel(fig2, name='Length and weight distributions') + +def serve_panels(): + # Create links to each panel + home = pn.Column( + pn.pane.Markdown("# Main Page"), + pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", +sizing_mode="stretch_width"), + pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", +sizing_mode="stretch_width"), + pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", +sizing_mode="stretch_width") + ) + + # Serve the home page and individual panels + pn.serve({ + 'Main Page': home, + 'gridded_population_estimates': panel0, + 'alongtrack_population_estimates': panel1, + 
'length_weight_distributions': panel2 + }, show=True) +# Run the function to serve panels +serve_panels() From 5a0eac8634cec200f05642dc7f62ab6f16af2f22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:32:39 +0000 Subject: [PATCH 78/81] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/test_workflow.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index f85abd4b..d274d021 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,25 +1,25 @@ -from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import SQL -import echopop.live.live_visualizer as elv from pathlib import Path -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -) + import boto3 -from botocore.exceptions import NoCredentialsError, ClientError -import pandas as pd import numpy as np -from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +import pandas as pd +from botocore.exceptions import ClientError, NoCredentialsError + +import echopop.live.live_visualizer as elv +from echopop.live import live_data_loading as eldl, live_data_processing as eldp +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import SQL, get_table_key_names, sql_data_exchange + sql_group_update, query_processed_files, sql_update_strata_summary -from echopop.live.live_spatial_methods import apply_spatial_definitions +from functools import reduce + +from echopop.acoustics import to_dB, to_linear, ts_length_regression from echopop.live.live_acoustics import average_sigma_bs, compute_nasc from echopop.live.live_biology import compute_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator -from functools import reduce from echopop.live.live_data_loading import filter_filenames, read_biology_csv +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.utils.operations import group_interpolator_creator ################################################################################################## # TEST: Set up `LiveSurvey` object @@ -99,6 +99,7 @@ # FROM THE `LiveSurvey` object ! 
# ---- Convert to a Panel import panel as pn + # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) @@ -120,6 +121,7 @@ def plt_to_pn(fig): plt_to_pn(fig) # ---- PLOT TRACK from echopop.live.live_visualizer import plot_livesurvey_track + fig1 = plot_livesurvey_track(survey_data, projection, coast_db) fig1.show() plt_to_pn(fig1) From 2a277a63c7421e2506103ca2034cc9741ef4a40e Mon Sep 17 00:00:00 2001 From: Sohambutala Date: Sun, 27 Oct 2024 02:46:54 +0000 Subject: [PATCH 79/81] add echopop live viz cmap and fig seq tweaks --- echopop/live/live_visualizer.py | 84 ++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index a1d55a26..a23baa88 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -66,28 +66,28 @@ def plot_livesurvey_grid( "number_density_mean": { "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", - "colormap": "viridis", + "colormap": "cividis", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", - "colormap": "plasma", + "colormap": "magma", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, - "biomass": { - "name": "Biomass", - "units": "kg", - "colormap": "cividis", + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$", + "colormap": "viridis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), "maximum": 1e6 * grid_gdf["area"].max(), }, }, - "abundance": { - "name": "Abundance", - "units": "$\\it{N}$", - "colormap": "inferno", + "biomass": { + "name": "Biomass", + "units": "kg", + "colormap": "plasma", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), "maximum": 1e6 * grid_gdf["area"].max(), @@ -106,7 +106,7 @@ def plot_livesurvey_grid( # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap(np.linspace(0, 1, 256))[::-1] + newcolors = colormap(np.linspace(0, 1, 256))#[::-1] # ---- Define `white` white = np.array([1, 1, 1, 1]) # ---- Replace "start" color @@ -244,12 +244,21 @@ def plot_livesurvey_track( # Variable label dictionary map VARIABLE_MAP = { + "nasc": { + "name": "Nautical area scattering coefficient", + "units": "$\\mathregular{m^{2}~nmi^{-2}}$", + "colormap": "YlOrRd", + "minimum": 0.0, + "cbar_reverse": False, + "color_threshold": {"minimum": 1e2, "maximum": 1e4}, + "size": [25, 150], + }, "number_density": { "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", - "colormap": "inferno", + "colormap": "Purples", "minimum": 0.0, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": { "minimum": 1e1, "maximum": 1e6, @@ -259,30 +268,21 @@ def plot_livesurvey_track( "biomass_density": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", - "colormap": "plasma", + "colormap": "Greens", "minimum": 0.0, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": { "minimum": 1e1, "maximum": 1e6, }, "size": [25, 150], }, - "nasc": { - "name": "Nautical area scattering coefficient", - "units": "$\\mathregular{m^{2}~nmi^{-2}}$", - "colormap": "viridis", - "minimum": 0.0, - "cbar_reverse": False, - "color_threshold": {"minimum": 1e2, "maximum": 1e4}, - "size": [25, 
150], - }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", "units": "dB re. 1 $\\mathregular{m^-1}$", - "colormap": "viridis", + "colormap": "Blues", "minimum": -999, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": {"minimum": -80.0, "maximum": -36.0}, "size": [5, 100], }, @@ -331,13 +331,16 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Iterate through and plot all subplots for ax, var in zip(axes.flat, intact_variables): - # ---- Get the colormap - colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) - # ---- Invert - if VARIABLE_MAP[var]["cbar_reverse"]: - newcolors = colormap(np.linspace(0, 1, 256))[::-1] - # ---- Create the new custom colormap - custom_cmap = ListedColormap(newcolors) + # # ---- Get the colormap + # colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # # ---- Invert + # if VARIABLE_MAP[var]["cbar_reverse"]: + # newcolors = colormap(np.linspace(0, 1, 256))[::-1] + # else: + # newcolors = colormap + # # ---- Create the new custom colormap + # custom_cmap = ListedColormap(newcolors) + custom_cmap = VARIABLE_MAP[var]["colormap"] # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", @@ -346,11 +349,11 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): for ship_id, group in survey_gdf.groupby("ship_id"): # Sort the group by latitude or longitude # group = group.sort_values(by=["latitude", "longitude"]) - color = ship_id_colors.get(ship_id, "gray") + # color = ship_id_colors.get(ship_id, "gray") (line_handle,) = ax.plot( group.geometry.x, group.geometry.y, - color=color, + color="gray", linewidth=0.25, linestyle="-", label=ship_id, @@ -375,6 +378,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): [geom.x for geom in sub_gdf.geometry], [geom.y for geom in sub_gdf.geometry], c=sub_gdf[var], + # s=20, s=scale_sizes( values=sub_gdf[var], min_value=min_value, @@ -548,9 +552,10 @@ def plot_livesurvey_distributions( ax_weight.plot( group["length_bin"], group["proportions"], - marker="o", + marker=".", label=f"Stratum {stratum}", color=color, + lw=1, ms=ms, ) if i == 0: @@ -561,7 +566,7 @@ def plot_livesurvey_distributions( ax_weight.set_ylabel("Within-stratum proportion [0, 1]") if i == num_sexes - 1: # Bottom plot ax_weight.set_xlabel("Length bin (cm)") - ax_weight.set_ylim(0.0, 1.0) + ax_weight.set_ylim(0.0, 0.8) # Add label in the top-left corner ax_weight.text( 0.05, @@ -582,9 +587,10 @@ def plot_livesurvey_distributions( ax_count.plot( group["length_bin"], group["number_proportion"], - marker="o", + marker=".", label=f"Stratum {stratum}", color=color, + lw=1, ms=ms, ) if i == 0: @@ -593,7 +599,7 @@ def plot_livesurvey_distributions( ax_count.set_xlabel("") if i == num_sexes - 1: # Bottom plot ax_count.set_xlabel("Length bin (cm)") - ax_count.set_ylim(0.0, 1.0) + ax_count.set_ylim(0.0, 0.8) # Add label in the top-left corner ax_count.text( 0.05, From 30c05ffff87061f76ce358b643c794ab508962dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 27 Oct 2024 02:47:28 +0000 Subject: [PATCH 80/81] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/live/live_visualizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_visualizer.py 
b/echopop/live/live_visualizer.py index a23baa88..1bba7fff 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -106,7 +106,7 @@ def plot_livesurvey_grid( # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap(np.linspace(0, 1, 256))#[::-1] + newcolors = colormap(np.linspace(0, 1, 256)) # [::-1] # ---- Define `white` white = np.array([1, 1, 1, 1]) # ---- Replace "start" color From 99a91a7d91a833b05004dd6c0d4cc8d9b9d398eb Mon Sep 17 00:00:00 2001 From: Sohambutala Date: Sun, 27 Oct 2024 03:02:36 +0000 Subject: [PATCH 81/81] tweak zorder of scatter and line in track plot, tweak units --- echopop/live/live_visualizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index a23baa88..35fa8bbc 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -65,7 +65,7 @@ def plot_livesurvey_grid( VARIABLE_MAP = { "number_density_mean": { "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$", + "units": "Number of fish per $\\mathregular{nmi^2}$", "colormap": "cividis", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, @@ -77,7 +77,7 @@ def plot_livesurvey_grid( }, "abundance": { "name": "Abundance", - "units": "$\\it{N}$", + "units": "Number of fish", "colormap": "viridis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), @@ -255,7 +255,7 @@ def plot_livesurvey_track( }, "number_density": { "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$", + "units": "Number of fish per $\\mathregular{nmi^2}$", "colormap": "Purples", "minimum": 0.0, "cbar_reverse": False, @@ -279,7 +279,7 @@ def plot_livesurvey_track( }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", - "units": "dB re. 1 $\\mathregular{m^-1}$", + "units": "dB re 1 $\\mathregular{m^-1}$", "colormap": "Blues", "minimum": -999, "cbar_reverse": False, @@ -357,7 +357,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): linewidth=0.25, linestyle="-", label=ship_id, - zorder=1, + zorder=2, ) handles.append(line_handle) # Add handle to legend # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, @@ -388,7 +388,9 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): ), cmap=custom_cmap, norm=norm, - zorder=2, + zorder=1, + alpha=0.6, + lw=0, # edgecolor="black", # linewidths=0.1 )
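
Note on the visualizer patches above (79 through 81): taken together, the track plot now passes named matplotlib colormap strings (for example "YlOrRd" for NASC) straight to `scatter` instead of building a reversed `ListedColormap`, draws every ship's cruise track in plain gray, and, after patch 81, layers that thin gray line at `zorder=2` above semi-transparent, size-scaled points at `zorder=1` with `alpha=0.6` and `lw=0`. The surrounding context also suggests the grid plot keeps its hand-built colormap but no longer reverses it, only forcing the first entry to white. The sketch below is a minimal, self-contained illustration of both ideas under those assumptions; it does not call `plot_livesurvey_track` or `plot_livesurvey_grid`, the coordinates and NASC values are synthetic, and the inline size scaling only roughly mirrors the module's `scale_sizes` helper.

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Synthetic stand-ins for a single cruise track and its NASC values
rng = np.random.default_rng(1)
lon = np.linspace(-126.0, -124.0, 200)
lat = 45.0 + 0.5 * np.sin(np.linspace(0.0, 6.0, 200))
nasc = rng.lognormal(mean=5.0, sigma=1.5, size=lon.size)

fig, ax = plt.subplots(figsize=(6, 4))

# Thin gray track drawn above the points (the zorder swap from patch 81)
ax.plot(lon, lat, color="gray", linewidth=0.25, linestyle="-", zorder=2)

# Colored, size-scaled points underneath, using a named colormap string
# (patch 79) with soft blending and no marker edges (patch 81); the size
# mapping approximates the [25, 150] range used for NASC in VARIABLE_MAP
sizes = 25 + (nasc - nasc.min()) / (nasc.max() - nasc.min()) * (150 - 25)
points = ax.scatter(
    lon, lat,
    c=nasc,
    s=sizes,
    cmap="YlOrRd",
    alpha=0.6,
    lw=0,
    zorder=1,
)
fig.colorbar(points, ax=ax, label="NASC ($\\mathregular{m^{2}~nmi^{-2}}$)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# Condensed version of the grid plot's white-start colormap (no reversal),
# using the colormap registry available in recent matplotlib releases;
# this object would be passed as `cmap=` when plotting the gridded fields
base = mpl.colormaps["cividis"]
colors = base(np.linspace(0.0, 1.0, 256))
colors[0] = (1.0, 1.0, 1.0, 1.0)  # empty grid cells render as white
white_start_cmap = ListedColormap(colors)

plt.show()

Keeping the line at the higher zorder keeps the cruise track legible where points pile up, while the transparency and zero edge width let dense clusters blend rather than stack hard outlines.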