From b56b618a45e89914f1c097274b51665b5ebf95e3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 08:44:28 -0700 Subject: [PATCH 01/81] Establishing inital `LiveSurvey` --- config_files/live_initialization_config.yml | 57 ++++++++++ config_files/live_survey_year_2019_config.yml | 91 ++++++++++++++++ echopop/live/__init__.py | 1 + echopop/live/acoustics.py | 0 echopop/live/biology.py | 0 echopop/live/core.py | 28 +++++ echopop/live/livesurvey.py | 41 +++++++ echopop/live/load.py | 0 echopop/live/spatial.py | 0 echopop/live/write.py | 0 echopop/zarr_read_ingest_test.py | 103 ++++++++++++++++++ 11 files changed, 321 insertions(+) create mode 100644 config_files/live_initialization_config.yml create mode 100644 config_files/live_survey_year_2019_config.yml create mode 100644 echopop/live/__init__.py create mode 100644 echopop/live/acoustics.py create mode 100644 echopop/live/biology.py create mode 100644 echopop/live/core.py create mode 100644 echopop/live/livesurvey.py create mode 100644 echopop/live/load.py create mode 100644 echopop/live/spatial.py create mode 100644 echopop/live/write.py create mode 100644 echopop/zarr_read_ingest_test.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml new file mode 100644 index 00000000..ba1474aa --- /dev/null +++ b/config_files/live_initialization_config.yml @@ -0,0 +1,57 @@ +# This YAML file is a configuration file for all +# initialization parameters used for the `LiveSurvey` +# class in Echopop + +--- + ##################################################################################################################### + # Biological data processing# + ######################## + # Jolly and Hampton (1990) stratified mean calculation + bio_hake_len_bin: [ # length sequence array 2 - 80 cm. This specifies the bin centers! + 2, # start of bin centers + 80, # end of bin centers + 40 # number of bins in total + ] + + ##################################################################################################################### + # Geospatial settings# + ######################## + inpfc: # INPFC northern latitude limits and labels + latitude_max: [36.0, 40.5, 43.0, + 45.7667, 48.50, 55.0] + stratum_names: [1, 2, 3, 4, 5, 6] + geospatial: + init: epsg:4326 # EPSG integer code for geodetic parameter dataset + + ##################################################################################################################### + # Analysis settings# + ######################## + acoustics: + nasc_frequency: 38 # kHz + biology: + separate_stations: + station_id: ["length", "specimen"] + ## NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + ## comprises True/False statements that denote the desired association. All values set to "True" will be output. 
+ ## `global` --> NASC associated with sigma_bs calculated from all survey data + ## `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + ## `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls + ## `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates + link_biology_acoustics: + global: False + INPFC: True + closest_haul: False + weighted_haul: False + ## NOTE: `biological_processing` + + ##################################################################################################################### + # Log-linear regression# + ######################## + # Target strength (TS) - length (L) regression: TS=m*log10(L)+b + TS_length_regression_parameters: + pacific_hake: # corresponding species text code + number_code: 22500 # species number code + TS_L_slope: 20.0 # the 'm' or 'slope' parameter + TS_L_intercept: -68.0 # the 'b' or 'y-intercept' + length_units: cm # units for L used in regression/relationship +... diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml new file mode 100644 index 00000000..f374e624 --- /dev/null +++ b/config_files/live_survey_year_2019_config.yml @@ -0,0 +1,91 @@ +# This YAML file is a configuration file specifying +# input filenames & some process parameter settings. +# Relative file paths defined below are concatenated +# with the data_root_dir path also set below. + +--- +############################################################################## +# Parameters + +survey_year: 2019 # survey year being considered + +############################################################################## +# Directory path that contains all input data needed + +data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files + +############################################################################## +# Input data directories + +acoustic: + directory: acoustic/ + extension: zarr + sheetname: null +biological: + directory: biology/ + extension: csv + sheetname: null + + length: + directory: Biological + filename: Biological/ + length: + US: + filename: Biological/US/2019_biodata_length.xlsx + sheetname: biodata_length + CAN: + filename: Biological/CAN/2019_biodata_length_CAN.xlsx + sheetname: biodata_length_CAN + specimen: + US: + filename: Biological/US/2019_biodata_specimen_AGES.xlsx + sheetname: biodata_specimen + CAN: + filename: Biological/CAN/2019_biodata_specimen_CAN_AGES.xlsx + sheetname: biodata_specimen_CAN + catch: + US: + filename: Biological/US/2019_biodata_catch.xlsx + sheetname: biodata_catch + CAN: + filename: Biological/CAN/2019_biodata_catch_CAN.xlsx + sheetname: biodata_catch_CAN + haul_to_transect: + US: + filename: Biological/US/haul_to_transect_mapping_2019.xlsx + sheetname: Sheet1 + CAN: + filename: Biological/CAN/haul_to_transect_mapping_2019_CAN.xlsx + sheetname: Sheet1 +stratification: + strata: + # The two stratification types are found in two sheets: "Base KS" and "INPFC" + filename: Stratification/US_CAN strata 2019_final.xlsx + sheetname: Base KS + geo_strata: + # The two stratification types are found in two sheets: "stratification1" and "INPFC" + filename: Stratification/Stratification_geographic_Lat_2019_final.xlsx + sheetname: [ INPFC , stratification1 ] +NASC: + # NASC values + no_age1: + # file that excludes age1 values + filename: Exports/US_CAN_detailsa_2019_table2y+_ALL_final - updated.xlsx + sheetname: 
Sheet1 + all_ages: + # file that includes all ages + filename: Exports/US_CAN_detailsa_2019_table1y+_ALL_final - updated.xlsx + sheetname: Sheet1 +kriging: + mesh: + filename: Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx + sheetname: krigedgrid2_5nm_forChu + isobath_200m: + # filename: Kriging_files/Kriging_grid_files/Smoothing_EasyKrig.xlsx + filename: Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx + sheetname: Smoothing_EasyKrig + vario_krig_para: + # NOTE: This file is not currently used + filename: Kriging_files/default_vario_krig_settings_2019_US_CAN.xlsx + sheetname: Sheet1 +... diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py new file mode 100644 index 00000000..b8585ba9 --- /dev/null +++ b/echopop/live/__init__.py @@ -0,0 +1 @@ +from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/acoustics.py b/echopop/live/acoustics.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/biology.py b/echopop/live/biology.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/core.py b/echopop/live/core.py new file mode 100644 index 00000000..de066ae3 --- /dev/null +++ b/echopop/live/core.py @@ -0,0 +1,28 @@ +from datetime import datetime + +import pandas as pd + +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} \ No newline at end of file diff --git a/echopop/live/livesurvey.py b/echopop/live/livesurvey.py new file mode 100644 index 00000000..70765b0f --- /dev/null +++ b/echopop/live/livesurvey.py @@ -0,0 +1,41 @@ +from typing import Union +from pathlib import Path +import copy +import yaml + +from .core import( + DATA_STRUCTURE +) + +from ..acoustics import ( + ts_length_regression, + to_dB, + to_linear +) + +class LiveSurvey: + """ + A real-time processing version of the `echopop` base + `Survey` class that ingests biological, acoustic, and + event meta data to provide population estimates when + generated. 
+ """ + + def __init__( + self + ): + # Initialize `meta` attribute + self.meta = copy.deepcopy(DATA_STRUCTURE["meta"]) + + # Loading the configuration settings and definitions that are used to + # initialize the Survey class object + self.config = el.load_configuration(Path(init_config_path), Path(survey_year_config_path)) + + # Loading the datasets defined in the configuration files + self.input = el.load_survey_data(self.config) + + # Initialize the `analysis` data attribute + self.analysis = copy.deepcopy(DATA_STRUCTURE["analysis"]) + + # Initialize the `results` data attribute + self.results = copy.deepcopy(DATA_STRUCTURE["results"]) \ No newline at end of file diff --git a/echopop/live/load.py b/echopop/live/load.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/spatial.py b/echopop/live/spatial.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/write.py b/echopop/live/write.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py new file mode 100644 index 00000000..9201adc7 --- /dev/null +++ b/echopop/zarr_read_ingest_test.py @@ -0,0 +1,103 @@ +import zarr +import xarray as xr +import shutil +from pathlib import Path +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +specimen_df = pd.DataFrame( + { + "haul_num": np.repeat([1,2,3], 4), + "station": "specimen", + "sex": np.tile(["male", "female"], 6), + "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), + "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, + }, +) + +length_df = pd.DataFrame( + { + "haul_num": np.repeat([1,2,3], 4), + "station": "length", + "sex": np.tile(["male", "female"], 6), + "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), + "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), + }, +) + +catch_df = pd.DataFrame( + { + "haul_num": np.array([1, 2, 3]), + "weight": np.array([503.12, 684.32, 978.54]) + } +) + +TS_SLOPE = 20.0 +TS_INTERCEPT = -68.0 + +#### +# CONCATENATE FILE SOURCES +specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() +specimen_reframed +# MELD +all_lengths = pd.concat([length_df, specimen_reframed]) +# COMBINE +comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() + + +# CONVERT TO TS +comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# TO SIGMA_BS +comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# WEIGHTED MEAN SIGMA_BS +sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +### +# INTEGRATE NASC +path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" + +Path(path2file).exists() +xds = xr.open_dataset(path2file, engine="zarr") +xds +xdf = xds.to_dataframe().reset_index() +xdf["NASC"] = xdf["NASC"].fillna(0.0) +# convert frequency +xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) +# filter +xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] + +xdf_38.plot.scatter(x="distance", y="depth", c="NASC") +plt.show() + +xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() + +plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) +plt.plot(xdf_int["longitude"], xdf_int["latitude"]) +plt.show() + +# CONVERT TO NUMBER DENSITY 
+xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) + + +################### +from typing import Union +from pathlib import Path +import copy +import yaml + +# from echopop.acoustics import ts_length_regression, to_dB, to_linear +# from echopop.live.core import DATA_STRUCTURE + + +### INIT CONFIG +initialization_config = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" + +# Initialize `meta` attribute +meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + +# Loading the configuration settings and definitions that are used to +# initialize the Survey class object +config = yaml.safe_load(Path(initialization_config).read_text()) + +nasc_frequency = config["acoustics"]["nasc_frequency"] \ No newline at end of file From 00c898dcb6027199c80b2648fb7974123dc8bc90 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 13:37:14 -0700 Subject: [PATCH 02/81] Initial data loading function refactoring --- config_files/live_initialization_config.yml | 79 +++-- config_files/live_survey_year_2019_config.yml | 81 +---- echopop/zarr_read_ingest_test.py | 327 +++++++++++++++++- 3 files changed, 374 insertions(+), 113 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index ba1474aa..7cff952b 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -6,52 +6,51 @@ ##################################################################################################################### # Biological data processing# ######################## - # Jolly and Hampton (1990) stratified mean calculation - bio_hake_len_bin: [ # length sequence array 2 - 80 cm. This specifies the bin centers! - 2, # start of bin centers - 80, # end of bin centers - 40 # number of bins in total - ] + biology: + # Length-binning + # NOTE: start : end : number + length_distribution: + bins: [2, 80, 40] + # Station separation + # NOTE: if `separate_stations` is True, `['list']` is required for `station_id` + stations: + separate_stations: True + station_id: ["length", "specimen"] ##################################################################################################################### # Geospatial settings# ######################## - inpfc: # INPFC northern latitude limits and labels - latitude_max: [36.0, 40.5, 43.0, - 45.7667, 48.50, 55.0] - stratum_names: [1, 2, 3, 4, 5, 6] geospatial: - init: epsg:4326 # EPSG integer code for geodetic parameter dataset + inpfc: # INPFC northern latitude limits and labels + latitude_max: [36.0, 40.5, 43.0, + 45.7667, 48.50, 55.0] + stratum_names: [1, 2, 3, 4, 5, 6] + projection: epsg:4326 # EPSG integer code for geodetic parameter dataset + # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + # comprises True/False statements that denote the desired association. All values set to "True" will be output. 
+ # `global` --> NASC associated with sigma_bs calculated from all survey data + # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls + # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates + link_biology_acoustics: + global: False + INPFC: True + closest_haul: False + weighted_haul: False ##################################################################################################################### - # Analysis settings# - ######################## + # Acoustics settings# + ######################## acoustics: - nasc_frequency: 38 # kHz - biology: - separate_stations: - station_id: ["length", "specimen"] - ## NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This - ## comprises True/False statements that denote the desired association. All values set to "True" will be output. - ## `global` --> NASC associated with sigma_bs calculated from all survey data - ## `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs - ## `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls - ## `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates - link_biology_acoustics: - global: False - INPFC: True - closest_haul: False - weighted_haul: False - ## NOTE: `biological_processing` - - ##################################################################################################################### - # Log-linear regression# - ######################## - # Target strength (TS) - length (L) regression: TS=m*log10(L)+b - TS_length_regression_parameters: - pacific_hake: # corresponding species text code - number_code: 22500 # species number code - TS_L_slope: 20.0 # the 'm' or 'slope' parameter - TS_L_intercept: -68.0 # the 'b' or 'y-intercept' - length_units: cm # units for L used in regression/relationship + # Acoustic transmit frequency (Hz or kHz) + transmit: + frequency: 38.0 + units: kHz + # Target strength (TS) - length (L) regression: TS=m*log10(L)+b + TS_length_regression_parameters: + pacific_hake: # corresponding species text code + number_code: 22500 # species number code + TS_L_slope: 20.0 # the 'm' or 'slope' parameter + TS_L_intercept: -68.0 # the 'b' or 'y-intercept' + length_units: cm # units for L used in regression/relationship ... 
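A minimal sketch of how the TS-length regression parameters defined above (TS = m*log10(L) + b with slope 20.0, intercept -68.0, lengths in cm) are typically applied to obtain the linear backscattering cross-section; the helper name `length_to_sigma_bs` is illustrative and not part of this patch:

import numpy as np
import pandas as pd

def length_to_sigma_bs(length_cm: pd.Series, slope: float = 20.0, intercept: float = -68.0) -> pd.Series:
    # TS = m * log10(L) + b, in dB re 1 m^2
    ts = slope * np.log10(length_cm) + intercept
    # Back-transform to the linear domain: sigma_bs = 10^(TS / 10), in m^2
    return 10.0 ** (ts / 10.0)

# Weighted-mean sigma_bs over binned length counts (mirrors the scratch script in this patch series)
lengths = pd.Series([11.0, 18.0, 23.0])
counts = pd.Series([103, 52, 81])
sigma_bs_mean = np.average(length_to_sigma_bs(lengths), weights=counts)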
diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index f374e624..6272b0fd 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -16,76 +16,13 @@ data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files ############################################################################## # Input data directories - -acoustic: - directory: acoustic/ - extension: zarr - sheetname: null -biological: - directory: biology/ - extension: csv - sheetname: null - - length: - directory: Biological - filename: Biological/ - length: - US: - filename: Biological/US/2019_biodata_length.xlsx - sheetname: biodata_length - CAN: - filename: Biological/CAN/2019_biodata_length_CAN.xlsx - sheetname: biodata_length_CAN - specimen: - US: - filename: Biological/US/2019_biodata_specimen_AGES.xlsx - sheetname: biodata_specimen - CAN: - filename: Biological/CAN/2019_biodata_specimen_CAN_AGES.xlsx - sheetname: biodata_specimen_CAN - catch: - US: - filename: Biological/US/2019_biodata_catch.xlsx - sheetname: biodata_catch - CAN: - filename: Biological/CAN/2019_biodata_catch_CAN.xlsx - sheetname: biodata_catch_CAN - haul_to_transect: - US: - filename: Biological/US/haul_to_transect_mapping_2019.xlsx - sheetname: Sheet1 - CAN: - filename: Biological/CAN/haul_to_transect_mapping_2019_CAN.xlsx - sheetname: Sheet1 -stratification: - strata: - # The two stratification types are found in two sheets: "Base KS" and "INPFC" - filename: Stratification/US_CAN strata 2019_final.xlsx - sheetname: Base KS - geo_strata: - # The two stratification types are found in two sheets: "stratification1" and "INPFC" - filename: Stratification/Stratification_geographic_Lat_2019_final.xlsx - sheetname: [ INPFC , stratification1 ] -NASC: - # NASC values - no_age1: - # file that excludes age1 values - filename: Exports/US_CAN_detailsa_2019_table2y+_ALL_final - updated.xlsx - sheetname: Sheet1 - all_ages: - # file that includes all ages - filename: Exports/US_CAN_detailsa_2019_table1y+_ALL_final - updated.xlsx - sheetname: Sheet1 -kriging: - mesh: - filename: Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx - sheetname: krigedgrid2_5nm_forChu - isobath_200m: - # filename: Kriging_files/Kriging_grid_files/Smoothing_EasyKrig.xlsx - filename: Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx - sheetname: Smoothing_EasyKrig - vario_krig_para: - # NOTE: This file is not currently used - filename: Kriging_files/default_vario_krig_settings_2019_US_CAN.xlsx - sheetname: Sheet1 +input_directories: + acoustic: + directory: acoustics/ + extension: zarr + sheetname: null + biological: + directory: biology/ + extension: csv + sheetname: null ... 
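The comment block at the top of this file states that the relative paths below are concatenated with `data_root_dir`; a hedged sketch of that lookup (the same pattern appears later in `load_acoustic_data`), with the config path shown here purely illustrative:

from pathlib import Path
import yaml

config = yaml.safe_load(Path("config_files/live_survey_year_2019_config.yml").read_text())
acoustic_settings = config["input_directories"]["acoustic"]
# data_root_dir + relative directory, then glob on the configured extension (e.g. *.zarr)
acoustic_dir = Path(config["data_root_dir"]) / acoustic_settings["directory"]
acoustic_files = list(acoustic_dir.glob(f"*.{acoustic_settings['extension']}"))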
diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 9201adc7..3eac35bb 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,10 +1,335 @@ import zarr import xarray as xr import shutil -from pathlib import Path import numpy as np import pandas as pd import matplotlib.pyplot as plt +from typing import Union, Tuple +from pathlib import Path +import copy +import yaml +import glob +from datetime import datetime +import geopandas as gpd + +#################################################################################################### +# * Functionality for a) loading YAML configuration file, b) search defined directory for +# * input files, c) ingest *.zarr/*.csv +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) + # ---- Raise error if needed + if config_intersect: + raise ValueError( + f"The initialization and file configuration files comprise the following intersecting " + f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " + f"file." 
+ ) + + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object + return {**init_config, **file_config} +#################################################################################################### +# TEST: YAML FILE CONFIGURATION +# ---- Define filepaths +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# ---- Run function: `live_configuration` +file_configuration = live_configuration(live_init_config_path, live_file_config_path) +file_configuration +#################################################################################################### +# * Accessory function for tuning the acoustic transmit frequency units/scaling +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == "kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values +#################################################################################################### +# * Define `LIVE_INPUT_FILE_CONFIG_MAP` configuration mapping (this will be in an equivalent +# * `core.py`) +# TODO: Update structure with additional information (as needed) +# TODO: Documentation +LIVE_INPUT_FILE_CONFIG_MAP = { + "acoustics": { + "xarray_coordinates": { + "distance": float, + "depth": float, + }, + "xarray_variables": { + "NASC": float, + "frequency_nominal": float, + "latitude": float, + "longitude": float, + "ping_time": "datetime64[ns]", + } + } +} +#################################################################################################### +# * Functionality for reading in processed acoustic data +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. +# TODO: Documentation +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Dataset]: + # Get acoustic directory and initialization settings + # ---- Files + acoustic_file_settings = file_configuration["input_directories"]["acoustic"] + # ---- General settings + acoustic_analysis_settings = file_configuration["acoustics"] + + # Create full filepath + acoustic_directory_path = ( + Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + ) + + # Validate filepath, columns, datatypes + # ---- Directory check + directory_existence = acoustic_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{acoustic_directory_path}] does not exist." 
+ ) + # ---- Get the defined file extension + file_extension = acoustic_file_settings["extension"] + # ---- In the case of a *.zarr file + if file_extension == "zarr": + # ---- Create Path.glob generator object + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" + ) + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ! [REQUIRES DASK] ---- Read in all listed files + zarr_data_ds = xr.open_mfdataset(zarr_files, + engine="zarr", + chunks="auto", + data_vars=specified_vars, + coords=specified_coords) + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()] + # ---- Validate data types + zarr_data_df_filtered = ( + zarr_data_df_filtered + .apply(lambda col: col.astype(full_config_map[col.name]) + if col.name in full_config_map else col) + ) + + # Extract defined acoustic frequency + # ---- From the configuration + transmit_settings = acoustic_analysis_settings["transmit"] + # ---- Transform `frequency_nominal`, if necessary + zarr_data_df_filtered["frequency_nominal"] = ( + configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], + transmit_settings, + zarr_data_ds["frequency_nominal"].units) + ) + # ---- Filter out any unused frequency coordinates + zarr_data_df_output = ( + zarr_data_df_filtered + [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) + # ---- Drop frequency column and return the output + return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata +#################################################################################################### +# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) +acoustic_data +coordinate_metadata +#################################################################################################### +def load_spatial_data(file_configuration: dict, + acoustic_data: pd.DataFrame, + coordinate_metadata: xr.Dataset): + + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Extract the projection + projection = file_configuration["geospatial"]["projection"] + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] + + # Validate the spatial biology-acoustics linking method + # ---- Get the biology-acoustics linking method + link_method = next(key for key, value in acoustics_biology_link.items() if value) + # ---- Flag Error if unexpected method + if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
+ ) + + # Validate projection information + # ---- Create a dummy GeoDataFrame to extract CRS information + # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) + # ---- Extract coordinate limits from the acoustic data + # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] + # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] + # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] + # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] + # # ---- Create boundary box string + # boundary_box_str = ( + # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " + # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" + # ) + + # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{utm_string_generator(lon_min, lat_min)}") + # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") + + # from pyproj import CRS + # from pyproj.aoi import AreaOfInterest + # from pyproj.database import query_utm_crs_info + + # utm_crs_list = query_utm_crs_info( + # datum_name="WGS 84", + # area_of_interest=AreaOfInterest( + # west_lon_degree=lon_min, + # south_lat_degree=lat_min, + # east_lon_degree=-lon_max, + # north_lat_degree=lat_max, + # ), + # ) + # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") + +#################################################################################################### +def live_data(file_configuration: dict): + + # Extract the file directories (or from the configuration) containing acoustic, biological, and + # spatial definitions/data/parameters + # ---- Acoustic data + acoustic_data = load_validated_acoustic_data(file_configuration) + # ---- Biological data + # ---- Spatial data + + + +#################################################################################################### +# * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) +# TODO: Update structure with additional information (as needed) +# TODO: Documentation +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} +#################################################################################################### +# * Define `LiveSurvey` class structure +# TODO: Incorporate validators +# TODO: Scope out full structure including accessors, attributes, and methods +# TODO: Configure input arguments (for initialization) +# TODO: Documentation +class LiveSurvey: + """ + A real-time processing version of the `echopop` base `Survey` class that ingests biological, + acoustic, and event meta data to provide population estimates when generated. 
+ """ + + def __init__( + self, + live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path], + ): + # Initialize `meta` attribute + self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + + # Loading the configuration settings and definitions that are used for defining the + # configuration settings + self.config = live_configuration(live_file_config_path, live_file_config_path) + + # Loading the datasets defined in the configuration files + self.input = el.load_survey_data(self.config) + + # Initialize the `results` data attribute + self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + +current_units = zarr_data_ds["frequency_nominal"].units +acoustic_analysis_settings["transmit"] +file_configuration specimen_df = pd.DataFrame( { From 69340e751adc68c1201f8590ea3a1fc959e304df Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 15 Jul 2024 21:47:42 -0700 Subject: [PATCH 03/81] Updated methods --- config_files/live_initialization_config.yml | 13 ++ config_files/live_survey_year_2019_config.yml | 8 +- echopop/zarr_read_ingest_test.py | 143 ++++++++++++++++-- 3 files changed, 148 insertions(+), 16 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 7cff952b..6033595a 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -16,6 +16,9 @@ stations: separate_stations: True station_id: ["length", "specimen"] + # Trawl identifier + catch: + partition: codend ##################################################################################################################### # Geospatial settings# @@ -25,7 +28,17 @@ latitude_max: [36.0, 40.5, 43.0, 45.7667, 48.50, 55.0] stratum_names: [1, 2, 3, 4, 5, 6] + griddify: + # Coordinate bounds + bounds: + latitude: [32.75, 54.75] + longitude: [-134.75, -117.00] + # x/y (or E-W/N-S) grid resolution in nmi + grid_resolution: + x: 25.0 + y: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset + # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This # comprises True/False statements that denote the desired association. All values set to "True" will be output. # `global` --> NASC associated with sigma_bs calculated from all survey data diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index 6272b0fd..bf65930d 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -12,7 +12,7 @@ survey_year: 2019 # survey year being considered ############################################################################## # Directory path that contains all input data needed -data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files +data_root_dir: C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files ############################################################################## # Input data directories @@ -20,9 +20,11 @@ input_directories: acoustic: directory: acoustics/ extension: zarr - sheetname: null biological: directory: biology/ extension: csv - sheetname: null + file_ids: + catch: catch_perc + length: lf + specimen: spec ... 
diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 3eac35bb..10ecf076 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,6 +1,4 @@ -import zarr import xarray as xr -import shutil import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -57,8 +55,8 @@ def live_configuration(live_init_config_path: Union[str, Path], #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` file_configuration = live_configuration(live_init_config_path, live_file_config_path) file_configuration @@ -150,6 +148,8 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas full_config_map = {**acoustics_config_map["xarray_coordinates"], **acoustics_config_map["xarray_variables"]} # ! [REQUIRES DASK] ---- Read in all listed files + # TODO: The sliding/overlapping windows makes this annoying -- in theory, only a single new zarr file will be ingested + # TODO: So this needs to be replaced w/ `open_dataset` instead zarr_data_ds = xr.open_mfdataset(zarr_files, engine="zarr", chunks="auto", @@ -200,6 +200,7 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata #################################################################################################### # TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# NOTE: # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) acoustic_data @@ -214,7 +215,14 @@ def load_spatial_data(file_configuration: dict, projection = file_configuration["geospatial"]["projection"] # ---- Extract the biology-acoustics linking method options acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] - + + # Convert the DataFrame to a GeoDataFrame + acoustic_data_gdf = gpd.GeoDataFrame( + data=acoustic_data, + geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), + crs=projection + ) + # Validate the spatial biology-acoustics linking method # ---- Get the biology-acoustics linking method link_method = next(key for key, value in acoustics_biology_link.items() if value) @@ -224,6 +232,9 @@ def load_spatial_data(file_configuration: dict, f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
) + + # Create INPFC stratum dataframe + # ---- Extract # Validate projection information # ---- Create a dummy GeoDataFrame to extract CRS information @@ -406,17 +417,123 @@ def __init__( ################### -from typing import Union -from pathlib import Path -import copy -import yaml +from geopy.distance import distance +from shapely.geometry import Polygon, Point, box +import geopandas as gpd +from shapely.ops import unary_union +import pyproj + + +grid_settings = file_configuration["geospatial"]["griddify"] +grid = [] +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +lat_min = grid_settings["bounds"]["latitude"][0] +lat_max = grid_settings["bounds"]["latitude"][1] +lon_min = grid_settings["bounds"]["longitude"][0] +lon_max = grid_settings["bounds"]["longitude"][1] + +utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +utm_proj = pyproj.Proj(f"epsg:{utm_str}") +x_min, y_min = utm_proj(lon_min, lat_min) +x_max, y_max = utm_proj(lon_max, lat_max) + +num_lon_steps = int((x_max - x_min) / lon_step) +num_lat_steps = int((y_max - y_min) / lat_step) + +lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) +lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) +lon2 = lon1 + lon_step +lat2 = lat1 + lat_step + +# Convert UTM coordinates back to degrees +lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) +lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) + +# Convert UTM coordinates back to degrees with adjusted resolution +lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) +lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) + +polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lon2_deg, lat2_deg)] +grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") + +world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ne_110m_land/ne_110m_land.shp") +bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) +shapefile = world +clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) +clipped_shapefile.to_crs(utm_proj.srs) +# clipped_geometry = bbox.intersection(world.union_all()) +# clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) + +from shapely.geometry import MultiPolygon +# Create an empty list to store clipped geometries +# clipped_geometries = [] + +# # Iterate over each grid polygon +# for index, row in grid_gdf.iterrows(): +# # Intersect grid polygon with land shape +# intersection = row['geometry'].intersection(clipped_shapefile.unary_union) + +# # If intersection is a MultiPolygon, get the difference with the land shape +# if isinstance(intersection, MultiPolygon): +# clipped = row['geometry'].difference(clipped_shapefile.unary_union) +# if clipped.is_empty: +# continue +# clipped_geometries.append(clipped) +# else: +# # If intersection is a single Polygon, directly add to clipped geometries +# clipped_geometries.append(intersection) + +# clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) + +clipped_geometries = grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile.geometry.union_all()) +clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) +clipped_gdf.to_crs(epsg=32610) + +invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] +clipped_gdf = clipped_gdf.buffer(0.001) 
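+# NOTE: 46300 m equals 25 nmi (25 * 1852), so the next step appears to normalize cell areas from
+# m^2 to units of the nominal 25 x 25 nmi grid cell rather than true square meters, despite the
+# 'area_sqm' label.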
+clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 + +clipped_gdf.area + +fig, ax = plt.subplots(figsize=(10, 8)) +clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) +plt.tight_layout() +plt.show() + + +bbox.crs = {"init": "epsg:4326"} +intersection = gpd.overlay(bbox, world, how='intersection') + +world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') + +world_cut = world[world.geometry.intersects(bbox)] +world_cut.to_crs("epsg:4326") + +import matplotlib.pyplot as plt +fig, ax = plt.subplots(figsize=(10, 10)) +grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +world_cut.plot(ax=ax, linewidth=2, color='blue') +plt.show() + +for cell in grid_gdf: + + x, y = cell.exterior.xy # Extract x and y coordinates of the cell + ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch +# Plot coastline +# world.plot(ax=ax, linewidth=2, color='blue') +plt.show() + + +bbox = (lat_min, lon_min, lat_max, lon_max) +G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) +G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], tags={'natural': ['coastline']}) -# from echopop.acoustics import ts_length_regression, to_dB, to_linear -# from echopop.live.core import DATA_STRUCTURE -### INIT CONFIG -initialization_config = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) +longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) # Initialize `meta` attribute meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) From 3adf521bac3371b301513f9df04362f81b3fcfe9 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 19 Jul 2024 17:41:32 -0700 Subject: [PATCH 04/81] Updated methods/processing (plus SQL) --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 14 +- echopop/mesh_generation.py | 905 ++++++++++++++++++ echopop/zarr_read_ingest_test.py | 745 ++++++++++++-- 4 files changed, 1607 insertions(+), 61 deletions(-) create mode 100644 echopop/mesh_generation.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 6033595a..84c48bbb 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -31,8 +31,8 @@ griddify: # Coordinate bounds bounds: - latitude: [32.75, 54.75] - longitude: [-134.75, -117.00] + latitude: [32.75, 55.50] + longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: x: 25.0 diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index bf65930d..a8450039 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -8,11 +8,13 @@ # Parameters survey_year: 2019 # survey year being considered - +species: + text_code: pacific_hake # target species for the survey year -- species name + number_code: 22500 # target species for the survey year -- numeric code ############################################################################## # Directory path that contains all input data needed -data_root_dir: C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files +data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files 
############################################################################## # Input data directories @@ -23,6 +25,14 @@ input_directories: biological: directory: biology/ extension: csv + file_name_formats: + catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" + length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" + specimen: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:spec}" + file_index: + catch: [haul_num] + length: [haul_num, species_id] + specimen: [haul_num, species_id] file_ids: catch: catch_perc length: lf diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py new file mode 100644 index 00000000..3fab6d89 --- /dev/null +++ b/echopop/mesh_generation.py @@ -0,0 +1,905 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import geopandas as gpd +from geopy.distance import distance +from shapely.geometry import Polygon, Point, box +import geopandas as gpd +from shapely.ops import unary_union +import pyproj +import geopy +from echopop.spatial.projection import wgs84_to_utm, utm_string_generator +import shapely.geometry +from echopop.survey import Survey +survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml" , + survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml" ) + + +grid_settings = file_configuration["geospatial"]["griddify"] +# lat_min = grid_settings["bounds"]["latitude"][0] +lat_min = 33.75 +# lat_max = grid_settings["bounds"]["latitude"][1] +lat_max = 55.50 +# lon_min = grid_settings["bounds"]["longitude"][0] +lon_min = -134.25 +lon_max = grid_settings["bounds"]["longitude"][1] + +projection = file_configuration["geospatial"]["projection"] + +utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +utm_num = int(utm_code) +utm_str = f"epsg:{utm_num}" + +biology_data = filtered_biology_output + +from sqlalchemy import create_engine, text, Engine, inspect +root_dir = file_configuration["data_root_dir"] +db_directory = Path(root_dir) / "database" +db_directory.mkdir(parents=True, exist_ok=True) +db_file = db_directory / "biology.db" +# Create the engine with the full path +engine = create_engine(f'sqlite:///{db_file}') + +SQL_COMMANDS = { + "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", + "drop": "DROP TABLE IF EXISTS {table_name};", + "select": "SELECT {columns} FROM {table_name};", + "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", + # "insert": "INSERT INTO {table_name} ({columns});", + "insert": """ + INSERT INTO {table_name} ({columns}) + SELECT {columns} + FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) + {filter_clause}; + """, + "inspect": None, +} + +SQL_DTYPES = { + 'int32': 'INTEGER', + 'int64': 'INTEGER', + 'float64': 'FLOAT', + 'bool': 'BOOLEAN', + 'datetime64[ns]': 'DATETIME', + 'object': 'TEXT' +} + +def SQL(db_file: str, command: str, **kwargs): + + # Create engine from `db_file` string + engine = create_engine(f"sqlite:///{db_file}") + + # Format `columns`, if there are any and more than 1 + if "columns" in kwargs.keys(): + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Format `columns`, if there are any and more than 1 + # if "filter_columns" in kwargs.keys(): + # # ---- Store the value for later + # 
kwargs["filter_columns_store"] = kwargs["filter_columns"] + # if isinstance(kwargs["filter_columns"], list): + # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) + + # Run the command + try: + with engine.connect() as connection: + # ---- SELECT + if command == "select": + return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # ---- CREATE + elif command == "create": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Check whether the table already exists or not + table_exists = ( + connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() + ) + # ---- If it doesn't, pre-allocate the table + if table_exists is None: + # ---- Get column definitions as a string + column_def_dict = { + col: SQL_DTYPES.get(str(dtype), 'TEXT') + for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) + } + # ---- Convert to a single string + kwargs["column_definitions"] = ( + ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) + ) + # ---- Create table + connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) + # ---- REPLACE + elif command == "replace": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Replace current + df_to_add.to_sql(name=kwargs["table_name"], + con=connection, + if_exists="replace", index=False) + + # ---- INSERT + elif command == "insert": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Check if + # table_exists = ( + # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() + # ) + # tables = SQL(db_file, "inspect") + # ---- If it doesn't, pre-allocate the table + # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): + df_to_add.to_sql(name=kwargs["table_name"], + con=connection, + if_exists="append", index=False) + # else: + # # ---- Format `filter_columns` command if present + # if "filter_columns" in kwargs.keys(): + # # ---- Fetch table + # fetch_table = ( + # connection.execute(text( + # ("SELECT DISTINCT {filter_columns} FROM {table_name}") + # .format(**kwargs)) + # ) + # ) + # # ---- Format the SQL data into a DataFrame + # fetched_df = pd.DataFrame(fetch_table.fetchall(), columns=fetch_table.keys()) + # # ---- Create an index tuples + # index_tuples = ( + # set(fetched_df[kwargs["filter_columns_store"]] + # .itertuples(index=False, name=None)) + # ) + # # ---- Filter the dataframe + # filtered_df = ( + # df_to_add[ + # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) + # .isin(index_tuples) + # ] + # ) + # # ---- Insert the data + # filtered_df.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="append", index=False) + # else: + # df_to_add.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="append", index=False) + # ---- INSPECT + elif command == "inspect": + return inspect(engine).get_table_names() + else: + connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + finally: + # ---- Dispose of the engine to release any resources being pooled/used + engine.dispose() + +_ = SQL(db_file, "drop", table_name="catch_df") +_ = SQL(db_file, "drop", table_name="specimen_df") +_ = SQL(db_file, "drop", table_name="length_df") +_ = SQL(db_file, "drop", table_name="files_read") + +_ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) +current = SQL(db_file, "select", table_name="files_read", columns="filepath") +current + + +# Get acoustic directory and initialization settings +# ---- Files +biology_file_settings = 
file_configuration["input_directories"]["biological"] +# ---- General settings +biology_analysis_settings = file_configuration["biology"] + +# Get the file-specific settings, datatypes, columns, etc. +# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# ---- Extract the expected file name ID's +biology_file_ids = biology_file_settings["file_name_formats"] +# ---- Extract all of the file ids +biology_config_ids = list(biology_file_ids.keys()) +# ---- Initialize the dictionary that will define this key in the `input` attribute +biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# ---- Initialize the SQL dictionary +sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + +# Create full filepath +biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] +) +# ---- Directory check +directory_existence = biology_directory_path.exists() +# ---- Error evaluation (if applicable) +if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) +# ---- Get the defined file extension +file_extension = biology_file_settings["extension"] +# ---- Create Path.glob generator object +file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") +#---- Create list of `*.csv`` files +csv_files = list(file_path_obj) +# ---- Ensure files exist or raise error otherwise +if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" + ) +else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + +# Iterate through each of the file ids and read in the data +for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object 
with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + +# Get contrasts used for filtering the dataset +# ---- Species +species_filter = file_configuration["species"]["number_code"] +# ---- Trawl partition information +trawl_filter = biology_analysis_settings["catch"]["partition"] +# ---- Apply the filter +filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty +} + +# Update the SQL database +for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + +# Combine the two datasets +merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) +} +# ---- Return output +merged_output + +coordinate_metadata.attrs[] + +SQL(biology_db, command="drop", table_name="catch_df") +SQL(biology_db, command="drop", table_name="specimen_df") +SQL(biology_db, command="drop", table_name="length_df") +SQL(biology_db, command="drop", table_name="files_read") +_ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") +tables = SQL(db_file, "inspect") +tables +current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) +current + +SQL(db_file, "select", table_name="catch_df", columns="*") +new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) +_ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) +current = SQL("select", engine, table_name="csv_files_read", columns="file_path") +current +for table_name, df in biology_data.items(): + df.to_sql(table_name, con=engine, if_exists='append', index=False) +command = "read" +engine = create_engine(f'sqlite:///{db_file}') +table_name = "files_read" +columns = "file_path" + +kwargs = { + "table_name": table_name, + "columns": columns, +} + +zarr_data_ds["depth"].diff(dim="depth") + +prc_nasc_df.groupby(["longitude", "latitude"]) + +from pandas.core.groupby import DataFrameGroupBy + +def 
estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) + else: + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) + +def process_group(group): + result = integrate_nasc(group, echometrics=True) + result = result.reset_index(drop=True) + # Concatenate the result back to the original group for alignment + group = group.reset_index(drop=True) + combined = pd.concat([group, result], axis=1) + return combined + +acoustic_data_df = acoustic_data["prc_nasc_df"] + + +rc_nasc_df[prc_nasc_df["distance"] == 0.0] +acoustic_data_df = mek[mek["distance"] == 0.0] +pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() +nasc_data_df = ( + prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) + .reset_index() +) + + + + +kwargs = { + "table_name": "csv_files_read", + "columns": "file_path", + "dataframe": new_files_df +} + +current_process = psutil.Process() +import logging + +# Create a session +Session = sessionmaker(bind=engine) +session = 
Session() + +# Perform database operations +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +logger.info("Performing database operations") + +# Create a session +Session = sessionmaker(bind=engine) +session = Session() + +# Perform database operations +logger.info("Performing database operations") + +# Close the session +session.close() +logger.info("Session closed") + +# Dispose the engine +engine.dispose() +logger.info("Engine disposed") + +# Force garbage collection +import gc +gc.collect() +logger.info("Garbage collection performed") + +import psutil + +pid = psutil.Process().pid +process = psutil.Process(pid) +open_files = process.open_files() +db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' + +# Check if the file is still in use +for file in open_files: + if db_path in file.path: + logger.info(f"File {db_path} is still in use.") + else: + logger.info(f"File {db_path} is not in use.") + +# Define the SQL to drop the table +drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" +# Execute the drop table SQL +with engine.connect() as connection: + _ = connection.execute(text(drop_table_sql)) + +import sqlite3 +if os.path.exists(db_path): + conn = sqlite3.connect(db_path) + conn.close() + # Force the file to be removed + try: + os.remove(db_path) + print(f"Database file {db_path} has been deleted.") + except PermissionError: + print(f"Failed to delete {db_path}. The file is still in use.") + +create_table_sql = """ +CREATE TABLE IF NOT EXISTS csv_files_read ( + file_path TEXT UNIQUE +); +""" +# Execute the create table SQL +with engine.connect() as connection: + _ = connection.execute(text(create_table_sql)) + +root_directory = Path(root_dir) +dataset = "biology" + +# Convert to strings +csv_files_str = [str(file) for file in csv_files] + +existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) +existing_files_set = set(existing_files_df['file_path']) +# Filter out duplicates from the csv_files list +new_files = [file for file in csv_files_str if file not in existing_files_set] +# Insert only new file paths into the SQL table +if new_files: + new_files_df = pd.DataFrame(new_files, columns=['file_path']) + _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) + + +with engine.connect() as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS csv_files_read ( + file_path TEXT UNIQUE + ) + """) + +csv_files +files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) +file_name_format = biology_file_ids[id] +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +from sqlalchemy.orm import sessionmaker +Session = sessionmaker(bind=engine) +session = Session() +session.close() +engine.pool.status() +# Dispose the engine to close all connections +engine.dispose() +import gc +gc.collect() +import psutil +dbapi_conn = engine.raw_connection() +dbapi_conn.close() +# Get the process ID of the current process +pid = psutil.Process().pid + +# List all open files for the 
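# NOTE: Illustrative sketch of the connection scoping that the file-handle checks around this
# block are probing for; `check_engine` is a throwaway name used only for this example.
check_engine = create_engine(f"sqlite:///{db_path}")
try:
    with check_engine.connect() as connection:
        connection.execute(text("SELECT 1"))
finally:
    # Dispose of the engine so SQLite releases its lock on the `*.db` file (important on Windows)
    check_engine.dispose()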
current process +process = psutil.Process(pid) +open_files = process.open_files() + +for file in open_files: + print(file.path) + + +pattern = filename_format +config_settings = sub_config_map +regex_pattern = pattern + +# Replace patterns based on LIVE_FILE_FORMAT_MAP +for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) +regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) +new_pattern = compile_filename_format(regex_pattern) +match_obj = new_pattern.search(file.name) +# Get substring components as a list +filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) +valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + +for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + + +# Assign the data as new columns to the DataFrame +for key, value in data_to_add.items(): + df[key] = value + +for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) +biology_analysis_settings +species_id_value = 22500 +trawl_partition_value = 'Codend' # Adjust as needed +{ + key: df[ + (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & + (('trawl_partition' not in df.columns) or (df['trawl_partition'] == trawl_partition_value)) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) +} + +(match_obj.group(i)).astype(matched_key["dtype"]) +pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' +modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) +# Create the regex pattern +regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +re.compile(regex_pattern) + +modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) + +# Create the regex pattern +regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +compile_filename_format(regex_pattern) +# Regular expression to capture values inside the curly braces +regex = r'\{([^:}]+):([^}]+)\}' + +# Find all matches +matches = re.findall(regex, modified_pattern) + +# Get substring components as a list +filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + +pattern_changed = pattern.replace("FILE_ID:", "") + +# Compilte the filename regular expression format +compiled_regex = compile_filename_format(pattern_changed) + +file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] + + # Get the file name and produce a `re.Match` object +match_obj = compiled_regex.search(file.name) + + +def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + + # Get the file name and produce a `re.Match` object + match_obj = pattern.search(file.name) + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_settings["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + + # Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the haul number and add the the dataframe + # ---- Extract the haul number and convert to an integer + haul_num = int(match_obj.group("HAUL")) + # ---- Add the column + df_validated["haul_num"] = haul_num + + # Return the resulting DataFrame + return df_validated + +## +grid_settings["grid_resolution"]["x"] = 50 +grid_settings["grid_resolution"]["y"] = 50 +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters + +# CREATE BOUNDING +bound_df = pd.DataFrame({ + "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), + "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) +}) + +bound_gdf = gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), + crs = projection +) + +utm_string_generator(-117.0, 33.75) +bound_gdf.total_bounds +# Convert to UTM +bound_utm = bound_gdf.to_crs(utm_num) +bound_utm.total_bounds +y_step = lat_step +x_step = lon_step +# bound_utm = bound_gdf +# y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 +# x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 + +xmin, ymin, xmax, ymax = bound_utm.total_bounds + +# Get number of cells +n_x_cells = int(np.ceil((xmax - xmin) / x_step)) +n_y_cells = int(np.ceil((ymax - ymin) / y_step)) + +import pyproj +# create the cells in a loop +# grid_cells = [] +# for x0 in np.arange(xmin, xmax, x_step): +# for y0 in np.arange(ymin, ymax, y_step): +# # bounds +# utm_zone = utm_string_generator(x0, y0) +# proj = pyproj.Proj(f"epsg:{utm_code}") +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +grid_cells = [] +for y0 in np.arange(ymin, ymax, y_step): + + # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) + + for x0 in np.arange(xmin, xmax, x_step): + # bounds + # utm_zone = utm_string_generator(x0, y0) + # proj = pyproj.Proj(f"epsg:{utm_code}") + # x1, y1 = proj(x0, y0) + # x2, y2 = proj(x0 - x_step, y0 + y_step) + # grid_cells.append(box(x1, y1, x2, y2)) + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +cells_gdf.shape +n_x_cells * n_y_cells +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) +cells_gdf.total_bounds +cells_gdf.to_crs(projection).total_bounds +from shapely.validation import make_valid +from shapely.geometry import mapping +######## +world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ne_10m_land/ne_10m_land.shp") +bb_orig = box(lon_min, lat_min, lon_max, lat_max) +boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) +world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) +world_clipped_latlon = gpd.clip(world, boundary_box) +world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) + +world_utm = world.to_crs(utm_code) +world_utm = world_utm[~world_utm.is_empty] + +bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) + +gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) + +bbox_utm = bound_utm.total_bounds + +buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] +array_buffer = 
bbox_utm + buffer +array_names = ["minx", "miny", "maxx", "maxy"] +buffered = dict(zip(array_names, array_buffer)) +buffer_boundary = box(**buffered) +# box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) +# buffer_boundary = buffer_boundary.to_crs(world_utm.crs) + +buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) # Replace with the correct EPSG code +bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) +# sub_clipped = gpd.clip(world_utm, buffer_boundary) +# sub_clipped = gpd.clip(world_utm, bbox_utm) + +# fig, ax = plt.subplots(figsize=(10, 10)) +# # Plot the buffer_boundary +# world.plot(ax=ax, linewidth=2, color='gray') +# buffer_boundary_gdf.to_crs(projection).plot(ax=ax, facecolor='none', edgecolor='blue') +# bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') +# plt.xlim(lon_min-3, lon_max+3) +# plt.ylim(lat_min-3, lat_max+3) +# plt.show() + +len(bbox_latlon.exterior.coords) +len(buffer_boundary.exterior.coords) + +# world_clipped_latlon = gpd.clip(world_utm, buffer_boundary).to_crs(projection) +world_clipped_latlon +######## +cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()).to_frame("geometry") +# cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()).to_frame("geometry") +cell_colors = cells_clipped.area / (lat_step * lon_step) +# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +cells_clipped['cell_colors'] = cell_colors +# ---> back to epsg lat/long +cells_latlon = cells_clipped.to_crs(projection) +cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) +cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) +# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +cells_latlon_clipped['cell_colors'] = cell_colors_clipped +######## +from shapely.geometry import Point, LineString, shape +nasc_df = survey.input["acoustics"]["nasc_df"] +nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], nasc_df["latitude"]), crs=projection) +geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())).to_frame("geometry").set_crs(projection) +custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' +cells_latlon_clipped.to_crs(custom_crs).crs +######## +import matplotlib.colors as colors +import matplotlib.cm as cm +cells_transformed = cells_latlon.to_crs(utm_code) +lims = cells_transformed.total_bounds + +fig, ax = plt.subplots(figsize=(10, 10)) +# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# cells_clipped.plot.hexbin() +cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), vmax=cells_latlon["cell_colors"].max()) +cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, orientation="horizontal", shrink=0.5) +cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, 
labelpad=10, loc='center') +cbar.ax.xaxis.set_label_position('top') +cbar.ax.xaxis.set_ticks_position('top') +geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") +# geo_df.reset_index().plot(ax=ax, color="red") +# plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") +ax.margins(0.00, 0.00) +world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') +plt.xlim(lims[0]*1.02, lims[2]*1.01) +# ax.set_yticks([4e6, 5e6, 6e6]) +# ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +plt.ylim(lims[1]*0.98, lims[3]*1.005) +ax.set_yticks([4e6, 5e6, 6e6]) +ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +plt.xlabel("Eastings (km)") +plt.ylabel("Northings (km)") +# plt.xlabel("Longitude (°E)") +# ax.set_xticks([-135, -130, -125, -120]) +# plt.ylabel("Latitude (°N)") +ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) +ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) +# Adding the colorbar title +# cax = fig.get_axes()[1] # Assuming the colorbar is the second axis +# cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar +plt.tight_layout() +plt.show() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 10ecf076..44a83ab4 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -9,6 +9,10 @@ import glob from datetime import datetime import geopandas as gpd +import os +import re +import contextlib +from sqlalchemy import create_engine, text, Engine, inspect #################################################################################################### # * Functionality for a) loading YAML configuration file, b) search defined directory for @@ -55,11 +59,11 @@ def live_configuration(live_init_config_path: Union[str, Path], #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` file_configuration = live_configuration(live_init_config_path, live_file_config_path) -file_configuration +file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling # TODO: Documentation @@ -98,20 +102,156 @@ def configure_transmit_frequency(frequency_values: pd.Series, "longitude": float, "ping_time": "datetime64[ns]", } - } + }, + "biology": { + "catch": { + "dtypes": { + "partition": str, + "species_code": int, + "sample_weight_kg": float, + "catch_perc": float, + }, + "names": { + "partition": "trawl_partition", + "species_code": "species_id", + "sample_weight_kg": "haul_weight", + "catch_perc": "catch_percentage", + } + }, + "length": 
{ + "dtypes": { + "sex": str, + "rounded_length": int, + "frequency": int, + }, + "names": { + "sex": "sex", + "rounded_length": "length", + "frequency": "length_count", + } + }, + "specimen": { + "dtypes": { + "rounded_length": int, + "organism_weight": float, + "sex": str, + }, + "names": { + "sex": "sex", + "rounded_length": "length", + "organism_weight": "weight" + }, + }, + }, } + +LIVE_FILE_FORMAT_MAP = { + "DATE:YYYYMM": { + "name": "date", + "dtype": "datetime[ns]", + "expression": r"(?P\d{6})", + }, + "DATE:YYYYMMDD": { + "name": "date", + "dtype": "datetime[ns]", + "expression": r"(?P\d{8})", + }, + "HAUL": { + "name": "haul_num", + "dtype": int, + "expression": r"(?P\d+)", + }, + "SPECIES_CODE": { + "name": "species_id", + "dtype": int, + "expression": r"(?P\d+)" + }, + "FILE_ID": { + "name": "file_id", + "dtype": str, + "expression": r"(?P.+)" + }, +} + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_settings["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" + ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated #################################################################################################### # * Functionality for reading in processed acoustic data # TODO: Expand data validator and limit cases to '*.zarr' (for now) # TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
# TODO: Documentation -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Dataset]: +def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: # Get acoustic directory and initialization settings # ---- Files acoustic_file_settings = file_configuration["input_directories"]["acoustic"] # ---- General settings acoustic_analysis_settings = file_configuration["acoustics"] + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ---- Initialize the dictionary that will define this key in the `input` attribute + acoustics_output = {"prc_nasc_df": pd.DataFrame(), + "nasc_df": pd.DataFrame()} + # ---- Initialize the SQL dictionary + # sql_acoustics_output = {"sv_df": pd.DataFrame()} + # Create full filepath acoustic_directory_path = ( Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] @@ -127,57 +267,79 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas ) # ---- Get the defined file extension file_extension = acoustic_file_settings["extension"] - # ---- In the case of a *.zarr file - if file_extension == "zarr": - # ---- Create Path.glob generator object - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" - ) - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ! 
[REQUIRES DASK] ---- Read in all listed files - # TODO: The sliding/overlapping windows makes this annoying -- in theory, only a single new zarr file will be ingested - # TODO: So this needs to be replaced w/ `open_dataset` instead - zarr_data_ds = xr.open_mfdataset(zarr_files, - engine="zarr", - chunks="auto", - data_vars=specified_vars, - coords=specified_coords) - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + # ---- Create Path.glob generator object (the case of a *.zarr file) + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()] - # ---- Validate data types - zarr_data_df_filtered = ( - zarr_data_df_filtered - .apply(lambda col: col.astype(full_config_map[col.name]) - if col.name in full_config_map else col) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "acoustics.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + zarr_files_str = [str(file) for file in zarr_files] + # ---- Create DataFrame + current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `zarr_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in zarr_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Find new files that have not yet been processed + if not new_files: + subset_files = zarr_files + else: + subset_files = set(zarr_files).intersection(set(new_files)) + + # Read in the `*.zarr` file(s) + # ! 
[REQUIRES DASK] ---- Read in the listed file + if len(subset_files) > 1: + zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + elif len(subset_files) == 1: + zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" ) - + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + # Extract defined acoustic frequency # ---- From the configuration transmit_settings = acoustic_analysis_settings["transmit"] @@ -197,14 +359,475 @@ def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame, xr.Datas # ---- Replace NASC `NaN` values with `0.0` zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) # ---- Drop frequency column and return the output - return zarr_data_df_output.drop(columns = ["frequency_nominal"]), coordinate_metadata + acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) + # ---- Return output + if update_config: + if file_configuration["database"]["acoustics"] is None: + file_configuration["database"]["acoustics"] = db_file + return acoustics_output, file_configuration + else: + return acoustics_output #################################################################################################### # TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION # NOTE: # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data, coordinate_metadata = load_acoustic_data(file_configuration) +acoustic_data, file_configuration = load_acoustic_data(file_configuration) acoustic_data -coordinate_metadata +#################################################################################################### +def load_biology_data(file_configuration: dict, update_config: bool = True): + + # Get acoustic directory and initialization settings + # ---- Files + biology_file_settings = file_configuration["input_directories"]["biological"] + # ---- General settings + biology_analysis_settings = file_configuration["biology"] + + # Get the file-specific settings, datatypes, columns, etc. 
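    # NOTE: Minimal sketch of how the config map drives the CSV validation performed later by
    # `read_biology_csv()`; the example row values below are fabricated for illustration only.
    _example_raw = pd.DataFrame({"partition": ["Codend"], "species_code": [22500],
                                 "sample_weight_kg": [12.5], "catch_perc": [98.2]})
    _example_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"]["catch"]
    _example_validated = (
        _example_raw.astype(_example_map["dtypes"]).rename(columns=_example_map["names"])
    )
    # ---- Columns become: trawl_partition, species_id, haul_weight, catch_percentage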
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Initialize the SQL dictionary + sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + # Create full filepath + biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + ) + # ---- Directory check + directory_existence = biology_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = biology_file_settings["extension"] + # ---- Create Path.glob generator object + file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") + #---- Create list of `*.csv`` files + csv_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" + ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + 
subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return merged_output +#################################################################################################### +# TEST: BIOLOGY FILE INGESTION CONFIGURATION +# NOTE: +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +biology_data, file_configuration = load_biology_data(file_configuration) +biology_data +#################################################################################################### +prc_nasc_df = acoustic_data["prc_nasc_df"] + +def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + 
"aggregation": float, "occupied_area": float}) + ) + + # Get the name of the associated db file + acoustics_db = file_configuration["database"]["acoustics"] + # ---- Get current tables + tables = SQL(acoustics_db, "inspect") + + # + if "nasc_df" not in tables: + _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) + else: + # ---- + nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") + # ---- + index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) + # ---- + bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) + # ---- + _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df.loc[~bool_idx]) + # ---- + nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) + + # Return the output + return nasc_data_df + + +SQL(acoustics_db, command="drop", table_name="nasc_df") +SQL(acoustics_db, "inspect") + +nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) + +SQL(acoustics_db, command="select", table_name="nasc_df") + +TS_SLOPE = 20.0 +TS_INTERCEPT = -68.0 + +# CONVERT TO TS +comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# TO SIGMA_BS +comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# WEIGHTED MEAN SIGMA_BS +sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +from typing import Optional +from echopop.utils import operations +from echopop.acoustics import ts_length_regression, to_linear, to_dB + +__all__ = ["operations"] + +# Meld bio datasets +length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], + contrasts=["haul_num", "sex", "species_id", "length"]) + +# Create distribution +distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + +length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float) +binwidth = np.diff(length_bins / 2.0).mean() +intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) +length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) +# +length_datasets["length_bin"] = pd.cut(length_datasets["length"], bins=intervals, labels=length_bins_df["bin"]) + +stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] + +if stratify_key == "global": + length_distribution = ( + length_datasets.pivot_table(columns=["sex"], index=["length_bin"], + values="length_count", aggfunc="sum", observed=False) + ) + # + length_distribution["total"] = length_distribution.sum(axis=1) + +length_distribution.transpose() +SQL(biology_db, "drop", table_name="length_distribution") +# Get the name of the associated db file +biology_db = file_configuration["database"]["biology"] +# ---- Get current tables +tables = SQL(biology_db, "inspect") + + +if "length_distribution" not in tables: + _ = SQL(biology_db, "insert", table_name="length_distribution", + dataframe=length_distribution.transpose()) + + +SQL(biology_db, "select", table_name="length_distribution") +SQL(biology_db, "drop", table_name="length_distribution") +SQL(biology_db, "replace", table_name="length_distribution", dataframe=length_distribution.unstack().reset_index(name="count")) +length_distribution.unstack().reset_index(name="count") +mixed = SQL(biology_db, "select", table_name="length_distribution") +length_bins[:1] +from typing import Optional +from echopop.utils import operations +from echopop.acoustics import ts_length_regression, to_linear, 
to_dB + +__all__ = ["operations"] + +# Meld bio datasets +length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], + contrasts=["haul_num", "species_id", "length"]) + +ts_length_parameters_spp = [ + spp + for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() + if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) +] + +# ---- get species info +target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + +ts_lengths_df = length_datasets.merge( + target_species.drop("length_units", axis=1), + left_on=["species_id"], + right_on=["number_code"], +) +# ---- filter out other spp +length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + +# +file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] + +def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Optional[float] = None, TS_L_intercept: Optional[float] = None, weighted: Optional[Union[float, int, str]] = None): + + # + if isinstance(length, pd.DataFrame): + if "length" not in length.columns: + raise ValueError( + "Column [`length`] missing from dataframe input `length`." + ) + if "TS_L_slope" not in length.columns and TS_L_slope is None: + raise ValueError( + "Value [`TS_L_slope`] missing from dataframe input `length` and optional " + "separate argument `TS_L_slope`." + ) + if "TS_L_intercept" not in length.columns and TS_L_intercept is None: + raise ValueError( + "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " + "separate argument `TS_L_intercept`." + ) + elif isinstance(length, float) or isinstance(length, int): + if TS_L_slope is None: + raise ValueError( + "Argument [`TS_L_slope`] missing." + ) + elif TS_L_slope is not None and not isinstance(TS_L_slope, float): + raise TypeError( + "Argument `TS_L_slope` must be type `float`." + ) + if "TS_L_intercept" not in length.columns and TS_L_intercept is None: + raise ValueError( + "Argument [`TS_L_intercept`] missing." + ) + elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): + raise TypeError( + "Argument `TS_L_intercept` must be type `float`." + ) + + # + if TS_L_slope is None: + TS_L_slope = length["TS_L_slope"] + + # + if TS_L_intercept is None: + TS_L_intercept = length["TS_L_intercept"] + + # + if isinstance(length, pd.DataFrame): + length_val = length["length"] + + ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) + sigma_bs_value = to_linear(ts_value) + + + + if isinstance(weighted, str): + if weighted not in length.columns: + raise ValueError( + f"Argument [`weighted` (str)], '{weighted}', is not a column in argument `length` " + f"(DataFrame)." + ) + else: + return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() + elif weighted is not None: + if weighted.size != sigma_bs_value.size: + raise ValueError( + f"Argument [`weighted` (float|int)] of size {weighted.size} does not match size of " + f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." 
+ ) + else: + return (sigma_bs_value * weighted).sum() / weighted.sum() + else: + return sigma_bs_value.mean() + +average_sigma_bs + +ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) +def integrate_nasc(prc_nasc_df: pd.DataFrame): + +# Compute the number of layers +echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size +}) + +# Compute the index of aggregation +echometrics.update({ + "aggregation": 1 / echometrics["evenness"] +}) + +# Get the occupied area +echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) +}) + + + + +pd.read_fr +pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) +engine = create_engine(f"sqlite:///{db_file}") +connection = engine.connect() +kwargs["dataframe"].to_sql(name=kwargs["table_name"], + con=connection, + if_exists="append", index=False) +connection.close() +engine.dispose() +SQL(db_file, "insert", table_name=table_name, columns="*", + filter_columns=insertion_filter, + dataframe=df) + +SQL(db_file, "select", table_name="files_read") +SQL(db_file, "select", table_name="catch_df") +SQL(db_file, "select", table_name="specimen_df") +SQL(db_file, "select", table_name="length_df") + +def check_table_schema(connection, **kwargs): + query = text(("PRAGMA table_info({table_name});").format(**kwargs)) + schema = connection.execute(query).fetchall() + print("Table Schema:", schema) + +check_table_schema(connection, table_name=table_name) + +def insert_test_data(connection, table_name): + test_data = pd.DataFrame({ + 'trawl_partition': ['test'], + 'species_id': ['test'], + 'haul_weight': [0.0], + 'catch_percentage': [0.0], + 'haul_num': [1] + }) + + test_data.to_sql(name=table_name, con=connection, if_exists='append', index=False) + print("Test data inserted.") + +insert_test_data(connection, table_name) + +kwargs = {} +command = "insert" +kwargs["table_name"] = "catch_df" +kwargs["dataframe"] = df +kwargs["filter_columns"] = insertion_filter +columns = "*" + + +re.compile(file_name_format) +pattern = file_name_format +pattern = pattern.replace('{DATE:YYYYMM}', r'(?P\d{6})') +pattern = pattern.replace('{HAUL}', r'(?P\d+)') +pattern = pattern.replace('{FILE_ID}', r'(?P.+)') +regex = re.compile(pattern) +haul_values = [] + +file_name_format.search(file.name) +sub_df_lst = [] +for file in subcsv_files: + match = regex.search(file.name) + if match: + haul_value = match.group('HAUL') + df = pd.read_csv(file, usecols=list(sub_config_map.keys())) + df['HAUL'] = haul_value # Append HAUL value as a new column + sub_df_lst.append(df) #################################################################################################### def load_spatial_data(file_configuration: dict, acoustic_data: pd.DataFrame, @@ -438,6 +1061,14 @@ def __init__( x_min, y_min = utm_proj(lon_min, lat_min) x_max, y_max = utm_proj(lon_max, lat_max) +lat = 55.5000 +lon = -134.2500 +utm_code = int(utm_string_generator(lon, lat)) +utm_proj = pyproj.Proj(f"epsg:{utm_code}") +utm_proj(lon, lat) +gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=projection).to_crs(utm_code) + + num_lon_steps = int((x_max - x_min) / lon_step) num_lat_steps = int((y_max - y_min) / lat_step) From 9b79d814f7c2ea949d0a7ff3ce9d40e7d724d6d3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Sun, 21 Jul 2024 20:05:09 -0700 Subject: [PATCH 05/81] Updating `LiveSurvey` methods --- echopop/live/__init__.py | 4 + echopop/live/acoustics.py 
| 0 echopop/live/core.py | 28 -- echopop/live/live_data_processing.py | 422 +++++++++++++++++++++++++++ echopop/live/liveacoustics.py | 143 +++++++++ echopop/live/livecore.py | 119 ++++++++ echopop/live/livesurvey.py | 56 ++-- echopop/live/sql_methods.py | 73 +++++ 8 files changed, 799 insertions(+), 46 deletions(-) delete mode 100644 echopop/live/acoustics.py delete mode 100644 echopop/live/core.py create mode 100644 echopop/live/live_data_processing.py create mode 100644 echopop/live/liveacoustics.py create mode 100644 echopop/live/livecore.py create mode 100644 echopop/live/sql_methods.py diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index b8585ba9..f4e742bb 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -1 +1,5 @@ +from echopop.utils import operations + +__all__ = ["operations"] + from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/acoustics.py b/echopop/live/acoustics.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/core.py b/echopop/live/core.py deleted file mode 100644 index de066ae3..00000000 --- a/echopop/live/core.py +++ /dev/null @@ -1,28 +0,0 @@ -from datetime import datetime - -import pandas as pd - -LIVE_DATA_STRUCTURE = { - "meta": { - "provenance": dict(), - "date": list(), - }, - "input": { - "acoustics": { - "nasc_df": pd.DataFrame(), - }, - "biology": { - "catch_df": pd.DataFrame(), - "distributions": { - "length_bins_df": pd.DataFrame(), - }, - "length_df": pd.DataFrame(), - "specimen_df": pd.DataFrame(), - }, - }, - "results": { - "acoustics": dict(), - "biology": dict(), - "stratified": dict(), - }, -} \ No newline at end of file diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py new file mode 100644 index 00000000..293862c4 --- /dev/null +++ b/echopop/live/live_data_processing.py @@ -0,0 +1,422 @@ +import yaml +import re + +from pathlib import Path +from typing import Union, Tuple + +import pandas as pd +import xarray as xr +import numpy as np + +from .livecore import( + LIVE_DATA_STRUCTURE, + LIVE_FILE_FORMAT_MAP, + LIVE_INPUT_FILE_CONFIG_MAP +) + +from .sql_methods import SQL + +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = 
set(init_config.keys()).intersection(set(file_config.keys()))
+    # ---- Raise error if needed
+    if config_intersect:
+        raise ValueError(
+            f"The initialization and file configuration files comprise the following intersecting "
+            f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration "
+            f"file."
+        )
+
+    # Combine both into a dictionary output that can be added to the `LiveSurvey` class object
+    return {**init_config, **file_config}
+
+# TODO: Documentation
+def compile_filename_format(file_name_format: str):
+
+    # Create a copy of `file_name_format`
+    regex_pattern = file_name_format
+
+    # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern
+    for key, value in LIVE_FILE_FORMAT_MAP.items():
+        regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
+    # ---- Replace the `FILE_ID` tag
+    regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P<FILE_ID>\1)', regex_pattern)
+
+    # Compile the regex pattern and return the output
+    return re.compile(regex_pattern)
+
+
+# TODO: Documentation
+def configure_transmit_frequency(frequency_values: pd.Series,
+                                 transmit_settings: dict,
+                                 current_units: str):
+
+    # Extract transmit frequency units defined in configuration file
+    configuration_units = transmit_settings["units"]
+
+    # Transform the units, if necessary
+    # ---- Hz to kHz
+    if current_units == "Hz" and configuration_units == "kHz":
+        return frequency_values * 1e-3
+    # ---- kHz to Hz
+    elif current_units == "kHz" and configuration_units == "Hz":
+        return frequency_values * 1e3
+    # ---- No change
+    else:
+        return frequency_values
+
+# TODO: Documentation
+def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict):
+
+    # Read in the `*.csv` file
+    df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys()))
+
+    # Validate the dataframe
+    # ---- Check for any missing columns
+    missing_columns = (
+        [key for key in config_settings["dtypes"].keys() if key not in df.columns]
+    )
+    # ---- Raise Error, if needed
+    if missing_columns:
+        raise ValueError(
+            f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!"
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_settings["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_settings["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated + +# TODO: Documentation +# TODO: Refactor, break up cyclomatic complexity +def load_biology_data(file_configuration: dict, update_config: bool = True): + + # Get acoustic directory and initialization settings + # ---- Files + biology_file_settings = file_configuration["input_directories"]["biological"] + # ---- General settings + biology_analysis_settings = file_configuration["biology"] + + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Initialize the SQL dictionary + sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + # Create full filepath + biology_directory_path = ( + Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + ) + # ---- Directory check + directory_existence = biology_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{biology_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = biology_file_settings["extension"] + # ---- Create Path.glob generator object + file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") + #---- Create list of `*.csv`` files + csv_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(csv_files) < 1: + raise FileNotFoundError( + f"No `*.csv` files found in [{biology_directory_path}]!" 
+ ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "biology.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + csv_files_str = [str(file) for file in csv_files] + # ---- Create DataFrame + current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `csv_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in csv_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == 
species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return merged_output + +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. +# TODO: Documentation +def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: + # Get acoustic directory and initialization settings + # ---- Files + acoustic_file_settings = file_configuration["input_directories"]["acoustic"] + # ---- General settings + acoustic_analysis_settings = file_configuration["acoustics"] + + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + # ---- Initialize the dictionary that will define this key in the `input` attribute + acoustics_output = {"prc_nasc_df": pd.DataFrame(), + "nasc_df": pd.DataFrame()} + # ---- Initialize the SQL dictionary + # sql_acoustics_output = {"sv_df": pd.DataFrame()} + + # Create full filepath + acoustic_directory_path = ( + Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + ) + + # Validate filepath, columns, datatypes + # ---- Directory check + directory_existence = acoustic_directory_path.exists() + # ---- Error evaluation (if applicable) + if not directory_existence: + raise FileNotFoundError( + f"The acoustic data directory [{acoustic_directory_path}] does not exist." + ) + # ---- Get the defined file extension + file_extension = acoustic_file_settings["extension"] + # ---- Create Path.glob generator object (the case of a *.zarr file) + file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") + # ---- Find all zarr files + zarr_files = list(file_path_obj) + # ---- Ensure files exist or raise error otherwise + if len(zarr_files) < 1: + raise FileNotFoundError( + f"No `*.zarr` files found in [{acoustic_directory_path}]!" 
+ ) + else: + # ---- Create Path to SQL database file + db_directory = Path(file_configuration["data_root_dir"]) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to `biology.db` + db_file = db_directory / "acoustics.db" + # ---- Query the external SQL database to see if the file tracking table exists + tables = SQL(db_file, "inspect") + # ---- Create a list of string-formatted Path names + zarr_files_str = [str(file) for file in zarr_files] + # ---- Create DataFrame + current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + # ---- Create if it is missing and then advance `zarr_files` + if "files_read" not in tables: + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", + dataframe=current_files) + # ---- Create empty list for later comparison + new_files = [] + else: + # ---- Pull already processed filenames + previous_files = SQL(db_file, "select", table_name="files_read") + # ---- Compare against the current filelist + new_files = ( + [file for file in zarr_files_str if file not in set(previous_files["filepath"])] + ) + # ---- Create a DataFrame for the new files + new_files_df = pd.DataFrame(new_files, columns=["filepath"]) + # ---- Insert into the SQL database file + _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + + # Find new files that have not yet been processed + if not new_files: + subset_files = zarr_files + else: + subset_files = set(zarr_files).intersection(set(new_files)) + + # Read in the `*.zarr` file(s) + # ! [REQUIRES DASK] ---- Read in the listed file + if len(subset_files) > 1: + zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + elif len(subset_files) == 1: + zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Extract coordinate metadata + coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one *.{file_extension} file in " + f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" 
+ ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Extract defined acoustic frequency + # ---- From the configuration + transmit_settings = acoustic_analysis_settings["transmit"] + # ---- Transform `frequency_nominal`, if necessary + zarr_data_df_filtered["frequency_nominal"] = ( + configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], + transmit_settings, + zarr_data_ds["frequency_nominal"].units) + ) + # ---- Filter out any unused frequency coordinates + zarr_data_df_output = ( + zarr_data_df_filtered + [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) + # ---- Drop frequency column and return the output + acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) + # ---- Return output + if update_config: + if file_configuration["database"]["acoustics"] is None: + file_configuration["database"]["acoustics"] = db_file + return acoustics_output, file_configuration + else: + return acoustics_output \ No newline at end of file diff --git a/echopop/live/liveacoustics.py b/echopop/live/liveacoustics.py new file mode 100644 index 00000000..f526f578 --- /dev/null +++ b/echopop/live/liveacoustics.py @@ -0,0 +1,143 @@ +from typing import Union, Optional + +import pandas as pd + +from echopop.acoustics import ts_length_regression, to_linear, to_dB + +# TODO: Documentation +def average_sigma_bs(length: Union[pd.DataFrame, float, int], + weights: Optional[Union[float, int, str]] = None): + + # Function approach for dataframe input + if isinstance(length, pd.DataFrame): + if "length" not in length.columns: + raise ValueError( + "Column [`length`] missing from dataframe input `length`." + ) + elif "TS_L_slope" not in length.columns: + raise ValueError( + "Column [`TS_L_slope`] missing from dataframe input `length`." + ) + elif "TS_L_slope" not in length.columns: + raise ValueError( + "Column [`TS_L_intercept`] missing from dataframe input `length`." + ) + else: + # ---- Compute the TS (as an array) + target_strength = ts_length_regression(length["length"], length["TS_L_slope"], + length["TS_L_intercept"]) + # ---- Convert to `sigma_bs` + sigma_bs_value = to_linear(target_strength) + # ---- Weighted or arithmetic avveraging + if weights is None: + return sigma_bs_value.mean() + elif weights not in length.columns: + raise ValueError( + f"Defined `weights` column, {weights}, missing from dataframe input " + f"`length`." 
+ ) + else: + return (sigma_bs_value * length[weights]).sum() / length[weights].sum() + +# TODO: Documentation +# TODO: Refactor +def estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) + else: + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) diff --git a/echopop/live/livecore.py b/echopop/live/livecore.py new file mode 100644 index 00000000..83e72a86 --- /dev/null +++ b/echopop/live/livecore.py @@ -0,0 +1,119 @@ +from datetime import datetime + +import pandas as pd + +LIVE_DATA_STRUCTURE = { + "meta": { + "provenance": dict(), + "date": list(), + }, + "database": { + "acoustics": None, + "biology": None, + "population": None, + }, + "input": { + "acoustics": { + "nasc_df": pd.DataFrame(), + }, + "biology": { + "catch_df": pd.DataFrame(), + "distributions": { + "length_bins_df": pd.DataFrame(), + }, + "length_df": pd.DataFrame(), + "specimen_df": pd.DataFrame(), + }, + }, + "results": { + "acoustics": dict(), + "biology": dict(), + "stratified": dict(), + }, +} + +# TODO: Update structure with additional information (as needed) 
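`LIVE_DATA_STRUCTURE` above acts as a template rather than shared state: its sub-dictionaries (`meta`, `database`, `input`, `results`) are deep-copied into each `LiveSurvey` instance later in this patch, so separate instances never mutate the same containers. A minimal sketch of that usage pattern follows; it assumes `echopop.live.livecore` is importable as created by this patch, and the variable name `survey_input` is purely illustrative.

import copy

import pandas as pd

from echopop.live.livecore import LIVE_DATA_STRUCTURE

# Deep-copy the "input" template so the module-level dictionary stays pristine
survey_input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"])

# Populating the copy does not mutate the shared template
survey_input["acoustics"]["nasc_df"] = pd.DataFrame({"NASC": [0.0, 12.5]})
assert LIVE_DATA_STRUCTURE["input"]["acoustics"]["nasc_df"].empty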
+# TODO: Documentation
+LIVE_INPUT_FILE_CONFIG_MAP = {
+    "acoustics": {
+        "xarray_coordinates": {
+            "distance": float,
+            "depth": float,
+        },
+        "xarray_variables": {
+            "NASC": float,
+            "frequency_nominal": float,
+            "latitude": float,
+            "longitude": float,
+            "ping_time": "datetime64[ns]",
+        }
+    },
+    "biology": {
+        "catch": {
+            "dtypes": {
+                "partition": str,
+                "species_code": int,
+                "sample_weight_kg": float,
+                "catch_perc": float,
+            },
+            "names": {
+                "partition": "trawl_partition",
+                "species_code": "species_id",
+                "sample_weight_kg": "haul_weight",
+                "catch_perc": "catch_percentage",
+            }
+        },
+        "length": {
+            "dtypes": {
+                "sex": str,
+                "rounded_length": int,
+                "frequency": int,
+            },
+            "names": {
+                "sex": "sex",
+                "rounded_length": "length",
+                "frequency": "length_count",
+            }
+        },
+        "specimen": {
+            "dtypes": {
+                "rounded_length": int,
+                "organism_weight": float,
+                "sex": str,
+            },
+            "names": {
+                "sex": "sex",
+                "rounded_length": "length",
+                "organism_weight": "weight"
+            },
+        },
+    },
+}
+
+LIVE_FILE_FORMAT_MAP = {
+    "DATE:YYYYMM": {
+        "name": "date",
+        "dtype": "datetime[ns]",
+        "expression": r"(?P<DATE>\d{6})",
+    },
+    "DATE:YYYYMMDD": {
+        "name": "date",
+        "dtype": "datetime[ns]",
+        "expression": r"(?P<DATE>\d{8})",
+    },
+    "HAUL": {
+        "name": "haul_num",
+        "dtype": int,
+        "expression": r"(?P<HAUL>\d+)",
+    },
+    "SPECIES_CODE": {
+        "name": "species_id",
+        "dtype": int,
+        "expression": r"(?P<SPECIES_CODE>\d+)"
+    },
+    "FILE_ID": {
+        "name": "file_id",
+        "dtype": str,
+        "expression": r"(?P<FILE_ID>.+)"
+    },
+}
diff --git a/echopop/live/livesurvey.py b/echopop/live/livesurvey.py
index 70765b0f..6d6a8621 100644
--- a/echopop/live/livesurvey.py
+++ b/echopop/live/livesurvey.py
@@ -3,8 +3,10 @@
 import copy
 import yaml
 
-from .core import(
-    DATA_STRUCTURE
+from .livecore import(
+    LIVE_DATA_STRUCTURE,
+    LIVE_FILE_FORMAT_MAP,
+    LIVE_INPUT_FILE_CONFIG_MAP
 )
 
 from ..acoustics import (
@@ -13,29 +15,47 @@
     to_linear
 )
 
+from . import live_data_processing as eldp
+
 class LiveSurvey:
     """
-    A real-time processing version of the `echopop` base
-    `Survey` class that ingests biological, acoustic, and
-    event meta data to provide population estimates when
-    generated.
+    A real-time processing version of the `echopop` base `Survey` class that ingests biological,
+    acoustic, and event meta data to provide population estimates when generated.
""" def __init__( - self + self, + live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path], + update_config: bool = True, + verbose: bool = True, ): # Initialize `meta` attribute - self.meta = copy.deepcopy(DATA_STRUCTURE["meta"]) + self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = el.load_configuration(Path(init_config_path), Path(survey_year_config_path)) - - # Loading the datasets defined in the configuration files - self.input = el.load_survey_data(self.config) - - # Initialize the `analysis` data attribute - self.analysis = copy.deepcopy(DATA_STRUCTURE["analysis"]) - - # Initialize the `results` data attribute - self.results = copy.deepcopy(DATA_STRUCTURE["results"]) \ No newline at end of file + self.config = eldp.live_configuration(Path(live_init_config_path), + Path(live_file_config_path)) + + # Initialize input attribute + self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) + + # Initialize database attribute + self.database = copy.deepcopy(LIVE_DATA_STRUCTURE["database"]) + + # Initialize the results attribute + self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + + # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict + # Ingest data + # ---- Acoustics + self.input["acoustics"]["prc_nasc_df"], self.config = eldp.load_acoustic_data(self.config, + update_config) + # ---- Biology + self.input["biology"], self.config = eldp.load_biology_data(self.config, + update_config) + + # TODO: Add verbosity for printing database filepaths/connections + if verbose: + pass \ No newline at end of file diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py new file mode 100644 index 00000000..37a9d3b7 --- /dev/null +++ b/echopop/live/sql_methods.py @@ -0,0 +1,73 @@ +from sqlalchemy import create_engine, text, Engine, inspect + +import pandas as pd + +SQL_COMMANDS = { + "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", + "drop": "DROP TABLE IF EXISTS {table_name};", + "select": "SELECT {columns} FROM {table_name};", + "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", + # "insert": "INSERT INTO {table_name} ({columns});", + "insert": """ + INSERT INTO {table_name} ({columns}) + SELECT {columns} + FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) + {filter_clause}; + """, + "inspect": None, +} + +SQL_DTYPES = { + 'int32': 'INTEGER', + 'int64': 'INTEGER', + 'float64': 'FLOAT', + 'bool': 'BOOLEAN', + 'datetime64[ns]': 'DATETIME', + 'object': 'TEXT' +} + +# TODO: Documentation +def SQL(db_file: str, command: str, **kwargs): + + # Create engine from `db_file` string + engine = create_engine(f"sqlite:///{db_file}") + + # Format `columns`, if there are any and more than 1 + if "columns" in kwargs.keys(): + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Run the command + try: + with engine.connect() as connection: + # ---- SELECT + if command == "select": + return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # ---- REPLACE + elif command == "replace": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Replace current + df_to_add.to_sql(name=kwargs["table_name"], + 
con=connection, + if_exists="replace", index=False) + + # ---- INSERT + elif command == "insert": + # ---- Extract dataframe + df_to_add = kwargs["dataframe"] + # ---- Inser into the table + df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", + index=False) + # ---- INSPECT + elif command == "inspect": + return inspect(engine).get_table_names() + # ---- OTHER COMMAND + else: + connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + finally: + # ---- Dispose of the engine to release any resources being pooled/used + engine.dispose() From 382d444d06b4427dd83902e3e33eaee3a62cbc6b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 22 Jul 2024 15:40:28 -0700 Subject: [PATCH 06/81] General changes --- echopop/live/biology.py | 0 echopop/live/load.py | 0 echopop/live/spatial.py | 0 echopop/live/sql_methods.py | 165 ++++- echopop/live/write.py | 0 echopop/mesh_generation.py | 1186 +++++++++++++++++++++++++++++++++++ 6 files changed, 1339 insertions(+), 12 deletions(-) delete mode 100644 echopop/live/biology.py delete mode 100644 echopop/live/load.py delete mode 100644 echopop/live/spatial.py delete mode 100644 echopop/live/write.py diff --git a/echopop/live/biology.py b/echopop/live/biology.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/load.py b/echopop/live/load.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/spatial.py b/echopop/live/spatial.py deleted file mode 100644 index e69de29b..00000000 diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 37a9d3b7..e8d8de93 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,9 +1,140 @@ from sqlalchemy import create_engine, text, Engine, inspect - +import sqlalchemy as sqla import pandas as pd +from typing import Optional + +def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, + primary_keys: Optional[list] = None): + """ + Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + + Args: + table_name (str): The name of the table. + columns (dict): A dictionary where keys are column names and values are data types. + primary_keys (list, optional): List of column names to be used as primary keys. + + Returns: + str: The SQL command to create the table. + """ + # Generate column definitions + column_definitions = ( + ",\n".join(f"{col} {SQL_DTYPES[type(col).__name__]}" + for col in df.columns) + ) + + # Generate primary key definition + primary_key_definition = "" + if primary_keys: + primary_key_definition = f",\nPRIMARY KEY ({', '.join(primary_keys)})" + + # Combine all parts into the final SQL command + create_table_command = f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + {column_definitions} + {primary_key_definition} + ); + """ + + # Execute + connection.execute(text(create_table_command.strip())) + +def sql_validate(connection: sqla.Connection, table_name: str): + """ + Check if a table exists in the database. + + Args: + connection: SQLAlchemy Connection object. + table_name (str): The name of the table to check. + + Returns: + bool: True if the table exists, False otherwise. + """ + inspector = inspect(connection) + return table_name in inspector.get_table_names() + +def sql_inspect(connection: sqla.Connection): + """ + Get a list of all tables present + + Args: + connection: SQLAlchemy Connection object. + + Returns: + list: True if the table exists, False otherwise. 
+ """ + return inspect(connection).get_table_names() + +def sql_drop(connection: sqla.Connection, table_name: str): + """ + """ + connection.execute(text(f"DROP TABLE IF EXISTS {table_name};")) + +def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, + id_columns: Optional[list] = None): + """ + Insert data into a table. + + Args: + connection (Connection): The SQLAlchemy Connection instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. + """ + + # Prepare the SQL statement for insertion + # ---- If not a List + if not isinstance(columns, list): + columns = list(columns) + + column_names = ", ".join(columns) + + # Convert the DataFrame into a tuple and then into a string + # ---- DataFrame to Tuple + data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] + # ---- Tuple to String + if dataframe.columns.size == 1: + data_str = ", ".join( + f"{', '.join(map(str, row))}" + for row in data_tuple + ) + else: + data_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuple + ) + + # Construct the "ON CONFLICT, DO UPDATE SET" if needed + on_conflict_clause = "" + if id_columns: + on_conflict_clause = f""" + ON CONFLICT ({', '.join(id_columns)}) + DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} + """ + + # Construct the SQL query + sql_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES ({data_str}) + {on_conflict_clause} + """ + + # Execute + connection.execute(text(sql_command.strip())) + + # Commit + connection.commit() + + SQL_COMMANDS = { - "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", + "create": sql_create, + "drop": sql_drop, + "inspect": sql_inspect, + "validate": sql_validate, + + + "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", "drop": "DROP TABLE IF EXISTS {table_name};", "select": "SELECT {columns} FROM {table_name};", @@ -24,22 +155,32 @@ 'float64': 'FLOAT', 'bool': 'BOOLEAN', 'datetime64[ns]': 'DATETIME', - 'object': 'TEXT' + 'object': 'TEXT', + "str": "TEXT", } +def format_sql_columns(kwargs: dict): + if "columns" in kwargs: + if isinstance(kwargs["columns"], list): + kwargs["columns"] = ", ".join(kwargs["columns"]) + else: + kwargs["columns"] = "*" + + # Return the updated `kwargs` dictionary + return kwargs + +# TODO: Documentation + + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): # Create engine from `db_file` string engine = create_engine(f"sqlite:///{db_file}") - - # Format `columns`, if there are any and more than 1 - if "columns" in kwargs.keys(): - if isinstance(kwargs["columns"], list): - kwargs["columns"] = ", ".join(kwargs["columns"]) - else: - kwargs["columns"] = "*" - + + # Format the data columns, if necessary, to fit within the SQL commands + kwargs = format_sql_columns(kwargs) + # Run the command try: with engine.connect() as connection: @@ -59,7 +200,7 @@ def SQL(db_file: str, command: str, **kwargs): elif command == "insert": # ---- Extract dataframe df_to_add = kwargs["dataframe"] - # ---- Inser into the table + # ---- Insert into the table df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", index=False) # ---- INSPECT diff --git a/echopop/live/write.py b/echopop/live/write.py deleted file mode 100644 index e69de29b..00000000 diff 
--git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 3fab6d89..077d9c93 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -1,3 +1,1189 @@ +import numpy as np +import pandas as pd +from sqlalchemy import create_engine, text +from pathlib import Path +import os + +SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) + +# Coordinates +x = np.array([1, 2, 3, 4, 5]) +y = np.array([1, 2, 3, 4, 5]) + +# Create the grid points +grid_points = [(i, j, 0) for i in x for j in y] + +# +data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") +db_directory = data_root_dir / "database" +# ---- Create the directory if it does not already exist +db_directory.mkdir(parents=True, exist_ok=True) +# ---- Complete path to `biology.db` +db_file = db_directory / "grid.db" + +from sqlalchemy import create_engine, MetaData, Table, select, inspect, update, text, case + +# Initialize the database and create the table +engine = create_engine(f"sqlite:///{db_file}") + +# Define metadata and the table to drop +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Drop the table +with engine.connect() as connection: + grid_table.drop(connection) + print("Table 'grid' has been dropped.") + +# Inspect the database +inspector = inspect(engine) +tables = inspector.get_table_names() +print(tables) + +def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): + """ + Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + + Args: + table_name (str): The name of the table. + columns (dict): A dictionary where keys are column names and values are data types. + primary_keys (list, optional): List of column names to be used as primary keys. + index_columns (list, optional): List of column names to be indexed. + + Returns: + str: The SQL command to create the table. 
+ """ + # Generate column definitions + column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) + + # Generate primary key definition + primary_key_definition = "" + if primary_keys: + primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" + + # Generate index definitions + index_definitions = "" + if index_columns: + index_definitions = "\n".join( + f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" + for col in index_columns + ) + + # Combine all parts into the final SQL command + create_table_command = f""" + CREATE TABLE IF NOT EXISTS {table_name} ( + {column_definitions} + {primary_key_definition} + ); + """ + # Return the command and any index definitions + return create_table_command.strip() + "\n" + index_definitions + +# Define metadata and the table to drop +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Drop the table +with engine.connect() as connection: + grid_table.drop(connection) + print("Table 'grid' has been dropped.") + +check_table_exists(engine, "grid") + +with engine.connect() as connection: + sql_create(connection, df, table_name, primary_keys) + +# Create the table +table_name = "grid" +columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +primary_keys = ["x", "y"] +index_columns = ["x", "y"] + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +with engine.connect() as connection: + connection.execute(text(create_sql)) + +inspector = inspect(engine) +tables = inspector.get_table_names() +print(tables) + +check_table_exists(engine, "grid") + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + +check_table_exists(engine, "files_read") + +zarr_files_str = ["A", "B", "C", "D"] +# ---- Create DataFrame +current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + +with engine.connect() as connection: + sql_create(connection, table_name="files_read", df=current_files) + sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) + +table_name = "files_read" +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + + +from sqlalchemy.exc import IntegrityError + +def insert_or_update(engine, table_name, columns, data, conflict_columns): + """ + Insert or update data in a table. + + Args: + engine (Engine): The SQLAlchemy engine instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. 
+ """ + + # Prepare the SQL statement for insertion + column_names = ", ".join(columns) + placeholder = ", ".join(f":{col}" for col in columns) + # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) + values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data + ) + + + # Construct the SQL query + sql = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_str} + ON CONFLICT ({', '.join(conflict_columns)}) + DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} + """ + + # Flatten the list of data for execution + # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item in data] for item in sublist] + + # Execute the SQL command + with engine.connect() as connection: + try: + connection.execute(text(sql)) + # connection.commit() + print(f"Data inserted or updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Prepare data for insertion or update +# data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] +data = grid_points + +# Insert or update data +insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + +def update_specific_rows(engine, table_name, updates, conditions): + """ + Update specific rows in a table based on conditions. + + Args: + engine (Engine): The SQLAlchemy engine instance. + table_name (str): The name of the table. + updates (dict): Dictionary of columns and their new values to be updated. + conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
+ """ + + # Construct the SET clause for the update + set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) + + # Construct the WHERE clause for the update + where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) + + # Construct the SQL query + sql = f""" + UPDATE {table_name} + SET {set_clause} + WHERE {where_clause} + """ + + # Prepare parameters for the query + parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} + + # Execute the SQL command + with engine.connect() as connection: + try: + connection.execute(text(sql), parameters) + print(f"Rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Define table name +table_name = "grid" +# Define the table and columns +table_name = 'grid' +condition_columns = ['x', 'y'] + +# Define the updates and conditions +dd = {"x": np.array([1, 2, 3 , 4, 5]), "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} +new_data = pd.DataFrame(dd) +new_data +df = new_data + +with engine.connect() as connection: + # sql_create(connection, table_name = "grid", df = df) + # sql_validate(connection, "grid") + # sql_drop(connection, "grid") + sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, id_columns=["x", "y"]) + + +data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +all_columns = df.columns.tolist() +if len(condition_columns) >= len(all_columns): + raise ValueError("The number of condition columns must be less than the number of columns in data.") + +# Prepare column names and conditions +update_columns = [col for col in all_columns if col not in condition_columns] +condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +data_tuples = [tuple(row) for row in df.itertuples(index=False)] +# Generate values string for SQL command +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuples +) + +# Construct the SQL query +sql = f""" +INSERT INTO {table_name} ({', '.join(all_columns)}) +VALUES {values_str} +ON CONFLICT ({', '.join(condition_columns)}) +DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col in update_columns)} +""" + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql)) + connection.commit() + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + + +# Insert or update data +insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) + +# Ensure that condition_columns match the length of data tuples minus the update column +if len(condition_columns) != len(df.columns) - 1: + raise ValueError("The number of condition columns must match the number of columns in data minus the update column.") + +# Prepare the SQL statement for update +update_columns = [col for col in df.columns if col not in condition_columns] +condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +# Convert DataFrame rows to list of tuples +data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# Generate a values string for the SQL command +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data_tuples +) +# Construct the SQL query +sql = f""" +UPDATE {table_name} +SET {update_str} +WHERE {condition_str} +""" + +# Flatten the list of data for execution +flattened_data = [] +for row in data_tuples: + conditions = row[:len(condition_columns)] + update_values = row[len(condition_columns):] + flattened_data.extend(conditions + update_values) + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql), flattened_data) + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") + +# Execute the SQL command +with engine.connect() as connection: + try: + connection.execute(text(sql), flattened_data) + print(f"Specific rows updated successfully in table '{table_name}'.") + except IntegrityError as e: + print(f"IntegrityError: {e}") + except Exception as e: + print(f"An error occurred: {e}") +# Update specific rows +update_specific_rows(engine, table_name, updates, conditions) + +# Verify the update +sql_command = f"SELECT * FROM {table_name};" +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +for row in rows: + print(row) +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + connection.commit() + +check_table_exists(engine, "grid") + +# Define table name, columns, and data +table_name = 'grid' +columns = ['x', 'y', 'value'] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0) +] + +# Prepare the columns part of the SQL statement +columns_str = ", ".join(columns) + +# Prepare the values part of the SQL statement +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data +) + + + + + + +print("Generated SQL Command:") +print(sql_command) + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + +def insert_values_sql(table_name, columns, values, filter_clause=""): + """ + Generate a SQL command to insert values into a table. + + Args: + table_name (str): The name of the table. + columns (list): List of column names to be inserted. + values (list of tuples): List of tuples where each tuple represents a row of values to be inserted. + filter_clause (str, optional): Optional filter clause to specify conditions for insertion. + + Returns: + str: The SQL command to insert values into the table. + """ + # Generate column names + column_names = ", ".join(columns) + + # Generate value placeholders + value_placeholders = ", ".join("?" * len(columns)) + + # Generate values part + values_part = ", ".join(f"({', '.join('?' 
* len(columns))})" for _ in values) + + # Flatten the values list for insertion + flattened_values = [item for sublist in values for item in sublist] + + # Create the SQL command + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + {filter_clause} + """ + return insert_command.strip(), flattened_values + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0)] + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Data:\n", insert_data) + +insrt_stmt = + +with engine.connect() as connection: + connection.execute(text(insert_sql), tuple(insert_data)) + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# Call the function +insert_or_update_table(engine, table_name, columns, data, conflict_columns) + +# Example usage +table_name = "grid" +columns = ["x", "y", "value"] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0), +] + +sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" +test_data = [{'x': 1, 'y': 1, 'value': 1.0}] + +with engine.connect() as connection: + connection.execute(text(sql_command), test_data) + +# Generate the SQL command and data +insert_stmt = insert_into_table(table_name, columns, data) + +# Print the generated SQL command (for validation) +print("Insert SQL Command:") +print(insert_stmt) + +# Print for validation +print("Insert SQL Command:") +print(insert_sql) +print("Data:") +print(insert_data) + +# Example execution with SQLAlchemy +with engine.connect() as connection: + connection.execute(insert_stmt) + +def insert_values_sql(table_name, columns, values): + """ + Generate SQL command for inserting values into a table. + + Args: + table_name (str): The name of the table. + columns (list): List of column names. + values (list of tuples): List of values to insert. + + Returns: + str: The SQL command to insert the values. + list: Flattened list of values for binding to the SQL command. + """ + column_names = ", ".join(columns) + value_placeholders = ", ".join("?" * len(columns)) + values_part = ", ".join(f"({value_placeholders})" for _ in values) + flattened_values = [item for sublist in values for item in sublist] + + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + """ + return insert_command.strip(), flattened_values + +def check_table_exists(engine, table_name): + """ + Check if a table exists in the database. + + Args: + engine: SQLAlchemy engine object. + table_name (str): The name of the table to check. + + Returns: + bool: True if the table exists, False otherwise. + """ + inspector = inspect(engine) + return table_name in inspector.get_table_names() + +with engine.connect() as connection: + # sql_validate(connection, "grid") + sql_inspect(connection) + sql_drop(connection, table_name) + +def select_from_table(engine, table_name, columns='*'): + """ + Select data from a table. + + Args: + engine: SQLAlchemy engine object. + table_name (str): The name of the table to select from. + columns (str or list): Columns to select. '*' selects all columns. + + Returns: + list: List of rows returned by the query. 
+ """ + metadata = MetaData(bind=engine) + table = Table(table_name, metadata, autoload_with=engine) + + if columns == '*': + columns = [col.name for col in table.columns] + elif isinstance(columns, str): + columns = [columns] + + stmt = select([table.c[col] for col in columns]) + + with engine.connect() as connection: + result = connection.execute(stmt) + return result.fetchall() + +# Create table +table_name = "grid" +columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +primary_keys = ["x", "y"] +index_columns = ["value"] + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +with engine.connect() as connection: + connection.execute(create_sql) + +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# Insert data function +def insert_values_sql(table_name, columns, values): + column_names = ", ".join(columns) + value_placeholders = ", ".join("?" * len(columns)) + values_part = ", ".join(f"({value_placeholders})" for _ in values) + + insert_command = f""" + INSERT INTO {table_name} ({column_names}) + VALUES {values_part} + """ + # Flatten the list of values into a single list + flattened_values = [value for sublist in values for value in sublist] + + return insert_command.strip(), flattened_values + + +table_name = 'grid' +columns = ['x', 'y', 'value'] +data = [ + (1, 1, 1.0), + (2, 2, 1.5), + (3, 3, 2.0) +] + +# Prepare the columns part of the SQL statement +columns_str = ", ".join(columns) + +# Prepare the values part of the SQL statement +values_str = ", ".join( + f"({', '.join(map(str, row))})" + for row in data +) + +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + +# Execute the SQL command +with engine.connect() as connection: + connection.execute(text(sql_command)) + +sql_command = f"SELECT * FROM {table_name};" + +with engine.connect() as connection: + result = connection.execute(text(sql_command)) + rows = result.fetchall() + +print(f"Data in table {table_name}:") +for row in rows: + print(row) +# Construct the full SQL command +sql_command = f""" +INSERT INTO {table_name} ({columns_str}) +VALUES {values_str}; +""" + + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Insert Data:\n", insert_data) + +with engine.connect() as connection: + connection.execute(insert_sql, [insert_data]) + +# Check table existence +exists = check_table_exists(engine, table_name) +print(f"Table '{table_name}' exists: {exists}") + +# Select data from table +data = select_from_table(engine, table_name, insert_columns) +print(f"Data from '{table_name}':") +for row in data: + print(row) + + + + +create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +print("Create Table SQL:\n", create_sql) + +# Define the values for insertion +insert_columns = ["x", "y", "value"] +insert_values = [(1, 1, 10.0)] + +insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +print("Insert Values SQL:\n", insert_sql) +print("Data:\n", insert_data) + +# Example usage +table_name = "grid" +columns = { + "x": "INTEGER", + "y": "INTEGER", + "value": "REAL" +} +primary_keys = ["x", "y"] +index_columns = ["value"] + +sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) +print(sql_command) + +# Create the table +create_table_sql = """ +CREATE TABLE IF NOT EXISTS 
grid ( + x INTEGER, + y INTEGER, + value REAL, + PRIMARY KEY (x, y) +); +""" + +# Insert grid points +insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) +insert_sql = f""" +INSERT INTO grid (x, y, value) VALUES {insert_values}; +""" + +# Connect to the database and execute the commands +with engine.connect() as connection: + try: + # Create table if it does not exist + connection.execute(text(create_table_sql)) + # Insert grid points + connection.execute(text(insert_sql)) + connection.commit() + print("Grid points successfully inserted.") + except Exception as e: + print(f"An error occurred: {e}") + + +engine = create_engine(f"sqlite:///{db_file}") +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Read existing grid values from the database into a DataFrame +with engine.connect() as connection: + select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) + result = connection.execute(select_stmt) + existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# Coordinates to update +update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# Create a dictionary for fast lookup +update_dict = {(i, j): 1.0 for i, j in update_coords} + +# Update the grid_points with new values where applicable +updated_grid_points = [ + (i, j, update_dict.get((i, j), value)) + for i, j, value in grid_points +] + +# Convert the list of tuples to a DataFrame +df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) + +# Print the DataFrame +print(df_updated_grid_points) + +# Merge existing and updated data to find differences +merged_data = pd.merge(existing_data, df_updated_grid_points, on=['x', 'y'], suffixes=('_existing', '_updated')) +differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] + +# Assuming 'differences' is your DataFrame with updated values +# Create a dictionary for batch updating +update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() + +# Generate the SQLAlchemy update statement +update_stmt = update(grid_table).where( + grid_table.c.x.in_(update_dict.keys()) +).values({ + grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) +}) + +# Create the CASE statement +case_stmt = case( + { + (grid_table.c.x == x) & (grid_table.c.y == y): value + for (x, y), value in update_dict.items() + }, + else_=grid_table.c.value +) + +# Convert the DataFrame into a dictionary of case statements +case_stmt = case( + [(grid_table.c.x == x) & (grid_table.c.y == y), value] + for (x, y), value in update_dict.items() +) + +# Create the case statement +case_stmt = case( + { (x, y): value for (x, y), value in update_dict.items() }, + value=grid_table.c.x, # Assuming `x` is the column being compared + else_=grid_table.c.value +) + +case_stmt = case( + { + (x, y): value + for (x, y), value in update_dict.items() + }, + value=grid_table.c.x, + else_=grid_table.c.value +) + +# Create the case statement +# Create a CASE statement using a dictionary +case_stmt = case( + { + (grid_table.c.x == x) & (grid_table.c.y == y): value + for (x, y), value in update_dict.items() + }, + else_=grid_table.c.value +) +case_stmt = case( + {((grid_table.c.x == x) & (grid_table.c.y == y)): value + for (x, y), value in update_dict.items()}, + else_=grid_table.c.value +) +print("Case Statement:", str(case_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + +# Create the update statement +update_stmt = ( + 
update(grid_table). + where(grid_table.c.value != case_stmt). + values(value=case_stmt) +) + +print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + +# Print the SQL for each update +for (x, y), value in update_dict.items(): + update_stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=value) + ) + # Print the SQL statement with literal values for debugging + print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + + # Execute the update statement + with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") + +# Execute the update +with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries.") + +engine.dispose() + +engine = create_engine(f"sqlite:///{db_file}") +metadata = MetaData() +grid_table = Table('grid', metadata, autoload_with=engine) +# Verify the updated rows +select_stmt = select(grid_table) + +with engine.connect() as connection: + result = connection.execute(select_stmt) + rows = result.fetchall() + +for row in rows: + print(row) + +# Define your SQLite engine and metadata +engine = create_engine(F'sqlite:///{db_file}') +metadata = MetaData() + +# Reflect the grid table +grid_table = Table('grid', metadata, autoload_with=engine) + +# Define your update dictionary +update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} + +# Execute updates +# with engine.connect() as connection: +connection = engine.connect() +# for (x, y), value in update_dict.items(): +(x,y) = (1, 1) +value = update_dict[(1,1)] + +update_stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=value) +) +# Print the SQL statement for debugging +print("Executing Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) + +# Execute the update statement +result = connection.execute(update_stmt) +print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") +connection.close() + +select_stmt = select(grid_table.c.x) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + x_values = result.fetchall() + +type(x_values[0]) + +select_stmt = select(grid_table.c.y) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + y_values = result.fetchall() + +select_stmt = select(grid_table.c.value) + +# Execute the SELECT statement +with engine.connect() as connection: + result = connection.execute(select_stmt) + values = result.fetchall() + +case_stmt = case( + *[(grid_table.c.x == x) & (grid_table.c.y == y, value) + for (x, y), value in update_dict.items()], + else_=grid_table.c.value +) + +update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} + +with engine.connect() as connection: + # Select all values to check the current state + result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) + current_values = result.fetchall() + print("Current Values:", current_values) + +with engine.connect() as connection: + with connection.begin(): # Begin a transaction + for (x, y), value in update_dict.items(): + stmt = ( + update(grid_table) + .where((grid_table.c.x == x) & (grid_table.c.y == y)) + .values(value=grid_table.c.value + value) + ) + 
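            # Each iteration issues one UPDATE that increments the stored `value` for a single
            # (x, y) coordinate; the surrounding `connection.begin()` block commits all of the
            # increments together as one transaction.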
connection.execute(stmt) + +with engine.connect() as connection: + # Re-select to check the updated state + result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) + updated_values = result.fetchall() + print("Updated Values:", updated_values) + + +# Confirm the updates +with engine.connect() as connection: + select_stmt = select([grid_table]) + result = connection.execute(select_stmt) + rows = result.fetchall() + +# Print all rows to verify updates +print("Database contents after update:") +for row in rows: + print(row) + + +# Construct the update statement +update_stmt = ( + update(grid_table) + .values(value=case_stmt) + .where(grid_table.c.value != case_stmt) +) + +# Create a SELECT statement to fetch all rows from the grid_table +select_stmt = select(grid_table) + +# Execute the SELECT statement and fetch results +with engine.connect() as connection: + result = connection.execute(select_stmt) + rows = result.fetchall() + +# Print or inspect the fetched rows +for row in rows: + print(row) + +# Create the update statement +update_stmt = ( + update(grid_table) + .where(grid_table.c.value != case_stmt) + .values(value=case_stmt) +) + +# Execute the update +with engine.connect() as connection: + result = connection.execute(update_stmt) + print(f"Updated {result.rowcount} entries.") + +case( + [ + ((grid_table.c.x == x) & (grid_table.c.y == y), value) + for (x, y), value in update_dict.items() + ], + else_=grid_table.c.value +) + +# Create a case statement for conditional update +case_statements = { + (x, y): case( + [(grid_table.c.x == x) & (grid_table.c.y == y, value)], + else_=grid_table.c.value + ) + for (x, y), value in update_dict.items() +} + + +# Define SQL command to select all data from the grid table +select_sql = "SELECT * FROM grid;" + +# Connect to the database and execute the query +with engine.connect() as connection: + try: + # Execute the select command + result = connection.execute(text(select_sql)) + # Fetch all rows from the result + rows = result.fetchall() + # Print the results + print("Data in grid table:") + for row in rows: + print(row) + except Exception as e: + print("An error occurred: {}".format(e)) + +# Coordinates to update +update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# Create a copy of grid_points and update specific coordinates +updated_grid_points = [ + (i, j, 1.0) if (i, j) in update_coords else (i, j, value) + for i, j, value in grid_points +] + +# Retrieve current data from the database +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + current_data = result.fetchall() + +# Convert to a dictionary for easy comparison +current_values = {(x, y): value for x, y, value in current_data} + +# Convert updated_grid_points to a dictionary +updated_values = {(i, j): value for i, j, value in updated_grid_points} + +# Find differences +differences = [ + (i, j, value) + for i, j, value in updated_grid_points + if (i, j) in updated_values and (i, j) not in current_values or + (i, j) in current_values and current_values[(i, j)] != value +] + +# Update differing values in the database +with engine.connect() as connection: + for i, j, value in differences: + connection.execute( + text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), + ) + print(f"Updated {len(differences)} entries.") + +# Step 8: Read the table into Python +with engine.connect() as connection: + # Query to select all rows from the table + result = connection.execute(text("SELECT x, y, value 
FROM grid;")) + df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# Print the DataFrame to validate the changes +print(df) + +# Check current values +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} + +print("Current grid points in database:") +for row in current_values.items(): + print(row) + +print("Updated grid points with changes:") +for row in updated_grid_points: + print(row) + +# Determine differences +differences = [ + (i, j, value) + for i, j, value in updated_grid_points + if (i, j) in current_values and current_values[(i, j)] != value +] + +print(f"Differences to update: {differences}") + +# Step 6: Update the database with INSERT OR REPLACE +with engine.connect() as connection: + with connection.begin(): # Ensure transactions are committed + for i, j, value in updated_grid_points: + sql = """ + INSERT OR REPLACE INTO grid (x, y, value) + VALUES (:x, :y, :value) + """ + print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") + connection.execute( + text(sql), + {"x": i, "y": j, "value": value} + ) + print(f"Updated entries with INSERT OR REPLACE.") + +# Step 8: Read the table into Python +with engine.connect() as connection: + result = connection.execute(text("SELECT x, y, value FROM grid;")) + rows = result.fetchall() + df = pd.DataFrame(rows, columns=['x', 'y', 'value']) + +# Print the DataFrame to validate the changes +print("Updated table data:") +print(df) + + +engine.dispose() + +# Check if the file exists and then remove it +if db_file.exists(): + db_file.unlink() + print(f"Deleted the file: {db_file}") +else: + print(f"The file does not exist: {db_file}") + +with engine.connect() as connection: + connection.execute(text(""" + CREATE TABLE IF NOT EXISTS grid ( + x INTEGER, + y INTEGER, + value REAL, + PRIMARY KEY (x, y) + ); + """)) + + connection.execute(text(""" + INSERT OR REPLACE INTO grid (x, y, value) VALUES + (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), + (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), + (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), + (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), + (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); + """)) + + # Insert initial values (0) into the grid table + values = ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + connection.execute(text("INSERT INTO grid (x, y, value) VALUES {values};".format(values=values))) + + # Commit + connection.commit() + + # Verify data insertion + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + print("Data in grid table:", rows) + + connection.execute(text(""" + INSERT INTO grid (x, y, value) VALUES + """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) + +engine.dispose() + + + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + print("Data in grid table:", rows) + +with engine.connect() as connection: + result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) + print(result.fetchall()) + +with engine.connect() as connection: + # Describe the table schema + result = connection.execute(text("PRAGMA table_info(grid);")) + columns = result.fetchall() + print("Table schema:", columns) + +with engine.connect() as connection: + result = connection.execute(text("SELECT * FROM grid;")) + rows = result.fetchall() + for row in rows: + 
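        # Each fetched `row` is an (x, y, value) tuple from the re-created `grid` table.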
print(row) + +SQL(db_file, command="select") + + + + + import pandas as pd import numpy as np import matplotlib.pyplot as plt From 2374ce9b96d2a17dd02af8f63104dcbf5f74ecbf Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 24 Jul 2024 18:26:04 -0700 Subject: [PATCH 07/81] Commited SQL changes --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 12 +- .../{liveacoustics.py => live_acoustics.py} | 0 echopop/live/{livecore.py => live_core.py} | 16 +- echopop/live/live_data_processing.py | 1029 ++++++++++++----- .../live/{livesurvey.py => live_survey.py} | 16 +- echopop/live/sql_methods.py | 234 ++-- echopop/mesh_generation.py | 113 +- echopop/zarr_read_ingest_test.py | 830 ++++++------- 9 files changed, 1419 insertions(+), 835 deletions(-) rename echopop/live/{liveacoustics.py => live_acoustics.py} (100%) rename echopop/live/{livecore.py => live_core.py} (86%) rename echopop/live/{livesurvey.py => live_survey.py} (78%) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 84c48bbb..a407520e 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -35,8 +35,8 @@ longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: - x: 25.0 - y: 25.0 + x_distance: 50.0 + y_distance: 50.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index a8450039..b7b7aef4 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -19,22 +19,30 @@ data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files ############################################################################## # Input data directories input_directories: - acoustic: + acoustics: directory: acoustics/ + database_name: acoustics.db extension: zarr - biological: + biology: directory: biology/ + database_name: biology.db extension: csv file_name_formats: catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" specimen: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:spec}" + trawl_info: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:operation_info}" file_index: catch: [haul_num] length: [haul_num, species_id] specimen: [haul_num, species_id] + trawl_info: [] file_ids: catch: catch_perc length: lf specimen: spec + trawl_info: operation_info + coastline: + directory: coastline/ + coastline_name: ne_110m_land ... 
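As a minimal sketch of how the `file_name_formats` templates added above (including the new
`trawl_info` entry) can drive file discovery: the patch later compiles each template into a
regular expression via `compile_filename_format`, substituting every `{TAG}` with an expression
from `LIVE_FILE_FORMAT_MAP`. Those expressions are not reproduced in this patch, so the numeric
patterns below are assumptions used only for illustration, as is the example filename.

import re

# Assumed stand-ins for the expressions defined in `LIVE_FILE_FORMAT_MAP` (not shown here)
EXAMPLE_FORMAT_MAP = {
    "DATE:YYYYMM": r"(?P<DATE>\d{6})",
    "HAUL": r"(?P<HAUL>\d+)",
    "SPECIES_CODE": r"(?P<SPECIES_CODE>\d+)",
}

def example_compile_filename_format(file_name_format: str) -> re.Pattern:
    """Turn a bracketed filename template into a compiled regex (illustration only)."""
    regex_pattern = file_name_format
    # Swap each `{TAG}` placeholder for its (assumed) regular expression
    for key, expression in EXAMPLE_FORMAT_MAP.items():
        regex_pattern = regex_pattern.replace(f"{{{key}}}", expression)
    # Replace the `{FILE_ID:...}` tag with a named group capturing the literal identifier
    regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P<FILE_ID>\1)", regex_pattern)
    return re.compile(regex_pattern)

# Hypothetical catch file matching the `catch` template declared in the configuration above
pattern = example_compile_filename_format("{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}")
match = pattern.search("201906_017_catch_perc.csv")
if match:
    print(match.groupdict())  # {'DATE': '201906', 'HAUL': '017', 'FILE_ID': 'catch_perc'}

The captured groups correspond to the identifiers listed under `file_index`, which appears to be
how `read_biology_csv` (below) attaches haul and species information to each ingested CSV.
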
diff --git a/echopop/live/liveacoustics.py b/echopop/live/live_acoustics.py similarity index 100% rename from echopop/live/liveacoustics.py rename to echopop/live/live_acoustics.py diff --git a/echopop/live/livecore.py b/echopop/live/live_core.py similarity index 86% rename from echopop/live/livecore.py rename to echopop/live/live_core.py index 83e72a86..95750f5f 100644 --- a/echopop/live/livecore.py +++ b/echopop/live/live_core.py @@ -63,6 +63,20 @@ "catch_perc": "catch_percentage", } }, + "trawl_info": { + "dtypes": { + "operation_number": int, + "td_timestamp": str, + "td_latitude": float, + "td_longitude": float, + }, + "names": { + "operation_number": "haul_num", + "td_timestamp": "datetime", + "td_latitude": "latitude", + "td_longitude": "longitude", + }, + }, "length": { "dtypes": { "sex": str, @@ -73,7 +87,7 @@ "sex": "sex", "rounded_length": "length", "frequency": "length_count", - } + }, }, "specimen": { "dtypes": { diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 293862c4..fd89993c 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,13 +2,13 @@ import re from pathlib import Path -from typing import Union, Tuple +from typing import Union, Tuple, Optional, List import pandas as pd import xarray as xr import numpy as np -from .livecore import( +from .live_core import( LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP @@ -34,7 +34,9 @@ def live_configuration(live_init_config_path: Union[str, Path], missing_config = [ files for files, exists in zip(config_files, config_existence) if not exists ] - raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") + raise FileNotFoundError( + f"The following configuration files do not exist: {missing_config}." + ) # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings @@ -56,21 +58,147 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -# TODO: Documentation -def compile_filename_format(file_name_format: str): +def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: - # Create a copy of `file_name_format` - regex_pattern = file_name_format + # Get acoustic directory and initialization settings + # ---- Create the full filepath + directory_path = Path(root_directory) / file_settings["directory"] + # ---- Get the defined file extension + file_extension = file_settings["extension"] + + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_extension}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_extension}` files found in [{directory_path}]!" 
+ ) + + # Return the output + return data_files - # Compile the regex pattern and return the output - return re.compile(regex_pattern) +def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = Path(root_directory) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Create a list of string-formatted Path names + files_str = [str(file) for file in files] + # ---- Create DataFrame + current_files = pd.DataFrame(files_str, columns=["filepath"]) + + # Check for the table `files_read` + files_read_tbl = SQL(db_file, "validate", table_name="files_read") + + # Validate whether the table exists; if not, create the table and then insert + if not files_read_tbl: + # ---- Create table + SQL(db_file, "create", table_name="files_read", dataframe=current_files, + primary_keys = ["filepath"]) + # ---- Populate table + SQL(db_file, "insert", table_name="files_read", dataframe=current_files) + # ---- Break early + return files_str, db_file + + # Query already existing files + previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) + # ---- Insert file list + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + + # Filter out previously processed files + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file + +def sql_data_exchange(database_file: Path, **kwargs): + + # Check whether the `table_name` table exists + table_exists = SQL(database_file, "validate", **kwargs) + + # If empty and table does not exist + if kwargs["dataframe"].empty and table_exists: + return SQL(database_file, "select", **kwargs) + + # Create table if it does not exist and run the initial insertion + if not table_exists: + # ---- Create table + SQL(database_file, "create", **kwargs) + # ---- Ignore the `id_columns` argument, if present + try: + del kwargs["id_columns"] + except KeyError: + pass + # ---- Insert into table + SQL(database_file, "insert", **kwargs) + # ---- Return the initial dataframe + return kwargs.get("dataframe") + + # Insert into the table + SQL(database_file, "insert", **kwargs) + + # Select existing data frame the database and return the output + return SQL(database_file, "select", **kwargs) + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Get the file-specific settings, datatypes, columns, etc. 
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + + # Determine the file loading method for the `acoustic_files` + if len(acoustic_files) > 1: + zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + else: + zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one file: in " + f"{', '.join(missing_columns)}!" + ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Gather some of the units + data_units = { + "longitude": zarr_data_ds.longitude.units, + "latitude": zarr_data_ds.latitude.units, + "frequency": zarr_data_ds.frequency_nominal.units, + } + # Return a Tuple + return zarr_data_df_filtered, data_units # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -90,17 +218,104 @@ def configure_transmit_frequency(frequency_values: pd.Series, # ---- No change else: return frequency_values + +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] -# TODO: Documentation -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + # Get and validate the acoustic data directory and files + acoustic_files = 
validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files, file_configuration["database"]["acoustics"] = ( + query_processed_files(root_directory, file_settings, acoustic_files) + ) + + # Read in the acoustic data files + if new_acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + # ---- Return output + return prc_nasc_df_processed + else: + return None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] + + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) # Validate the dataframe # ---- Check for any missing columns missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] + [key for key in config_map["dtypes"].keys() if key not in df.columns] ) # ---- Raise Error, if needed if missing_columns: @@ -108,9 +323,9 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
) # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) + df_validated = df.astype(config_map["dtypes"]) # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) + df_validated = df_validated.rename(columns=config_map["names"]) # Get the substring components that can be added to the DataFrame filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) @@ -130,293 +345,545 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # Return the resulting DataFrame return df_validated -# TODO: Documentation -# TODO: Refactor, break up cyclomatic complexity -def load_biology_data(file_configuration: dict, update_config: bool = True): +def preprocess_biology_data(biology_output: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - # Get acoustic directory and initialization settings - # ---- Files - biology_file_settings = file_configuration["input_directories"]["biological"] - # ---- General settings - biology_analysis_settings = file_configuration["biology"] + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return results_dict + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + 
"latitude"]) + ) + + # Return a list of the output + return list(key_columns) + +def load_biology_data(file_configuration: dict): + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["biology"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + biology_files = validate_data_directory(root_directory, file_settings) + + # Query `biology.db` to process only new files (or create the db file in the first place) + # SQL(biology_db, "drop", table_name="files_read") + new_biology_files, file_configuration["database"]["biology"] = ( + query_processed_files(root_directory, file_settings, biology_files) + ) # Get the file-specific settings, datatypes, columns, etc. # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = biology_file_settings["file_name_formats"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] # ---- Extract all of the file ids biology_config_ids = list(biology_file_ids.keys()) # ---- Initialize the dictionary that will define this key in the `input` attribute biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Initialize the SQL dictionary - sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - # Create full filepath - biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] ) - # ---- Directory check - directory_existence = biology_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." 
+ + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + file_settings["file_name_formats"][dataset], + new_biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Pre-process and return the results + return preprocess_biology_data(biology_output, file_configuration) + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], + }, + }, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] + }, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] + }, + "pairs": [("longitude", "latitude"), ("northings", "eastings")], + }, + "grid_resolution": { + "x_distance": { + "types": float, + }, + "y_distance": { + "types": float, + }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], + }, + }, + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, + }, + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, + }, +} + +def validate_spatial_config(spatial_config: dict): + + # Check the link method + # ---- Extract string-formatted method name + link_method = spatial_config["link_biology_acoustics"].lower() + # ---- Validate + if link_method not in SPATIAL_CONFIG_MAP.keys(): + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." ) - # ---- Get the defined file extension - file_extension = biology_file_settings["extension"] - # ---- Create Path.glob generator object - file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") - #---- Create list of `*.csv`` files - csv_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
+ + # Verify that associated parameters are present in the configuration settings + # ---- Get keys as a list + config_keys = list(spatial_config.keys()) + # ---- Check for specific methods + if link_method not in config_keys and link_method != "global": + raise ValueError( + f"No parameters provided for the biology-acoustic linking ([{link_method}])." + ) + + # Check key settings + if link_method == "griddify": + validate_griddify_config(spatial_config, link_method) + elif link_method == "inpfc": + validate_inpfc_config(spatial_config, link_method) + elif link_method != "global": + validate_hauls_config(spatial_config, link_method) + +def validate_hauls_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for `proximity` + if "proximity" not in input_method_settings.keys(): + raise KeyError( + "The following parameters are missing from the biology-acoustic linking method: " + "'proximity'!" ) - else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - 
sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files + + # Evaluate valid options for `proximity` + if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: + raise KeyError( + f"Value biology-acoustic linking method parameter `proximity` must be one of the : " + f"following: {link_method_settings["proximity"]["choices"]}." + ) + +def validate_griddify_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key] + # ---- Compare entries + parameter_diff = set(input.keys()).difference(set(model.keys())) + # ---- Raise Error + if parameter_diff: + raise KeyError( + f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " + f"configuration." + ) + # ---- Check if the appropriate coordinate pairs are present + coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] + # ---- Count the number of paired coordinates + pair_counts = [len(param) for param in coordinate_pairs] + # ---- If there are multiple pairs + if (np.array(pair_counts) == 2).sum() != 1: + raise ValueError( + f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " + f"for the link method '{link_method}' defined via the following options: " + f"{model["pairs"]}." + ) + # ---- Check the datatypes + for parameter in input.keys(): + # ---- Get the datatypes + config_dtypes = model[parameter]["types"] + # ---- Get input parameter + input_parameter = input[parameter] + # ---- If List + if isinstance(config_dtypes, list): + if not isinstance(input_parameter, list): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be contained within a list." 
+ ) else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + input_parameter = [input_parameter] + config_dtypes = [config_dtypes] + # ---- Check correct datatypes + if not np.all([type(value) in config_dtypes for value in input_parameter]): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be one of the following types within a list: " + f"{config_dtypes}." + ) - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } +def validate_inpfc_config(spatial_config: dict, link_method: str): - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key]["types"] + # ---- Evaluate if a list + if not isinstance(input, list): + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be contained within a list." + ) + # ---- Evaluate if it is a type within the list + if not type(input[0]) in model: + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be one of the following types within a list: {model}." 
+ ) - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output +def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. -# TODO: Documentation -def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: - # Get acoustic directory and initialization settings - # ---- Files - acoustic_file_settings = file_configuration["input_directories"]["acoustic"] - # ---- General settings - acoustic_analysis_settings = file_configuration["acoustics"] + # Extract the INPFC definitions + inpfc_definitions = spatial_config["inpfc"] + + # Create latitude bins + latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) + # ---- Append 1 more stratum layer + bin_names = np.concatenate([inpfc_definitions["stratum_names"], + [np.max(inpfc_definitions["stratum_names"]) + 1]]) - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ---- Initialize the dictionary that will define this key in the `input` attribute - acoustics_output = {"prc_nasc_df": pd.DataFrame(), - "nasc_df": pd.DataFrame()} - # ---- Initialize the SQL dictionary - # sql_acoustics_output = {"sv_df": pd.DataFrame()} - - # Create full filepath - acoustic_directory_path = ( - Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] + # Create spatial key + spatial_config["spatial_key"] = pd.DataFrame({ + "latitude_limit": inpfc_definitions["latitude_max"], + }) + # ---- Cut + spatial_config["spatial_key"]["stratum"] = ( + pd.cut(inpfc_definitions["latitude_max"], + latitude_bins, + right = True, + labels = bin_names) ) - - # Validate filepath, columns, datatypes - # ---- Directory check - directory_existence = acoustic_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{acoustic_directory_path}] does not exist." 
+ + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + # ---- Bin the latitude data + acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( + acoustic_data["prc_nasc_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, ) - # ---- Get the defined file extension - file_extension = acoustic_file_settings["extension"] - # ---- Create Path.glob generator object (the case of a *.zarr file) - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" + + # Get the `trawl_info_df` values, if they exist, and apply stratification information + if not biology_data["trawl_info_df"].empty: + # ---- Bin the latitude data + biology_data["trawl_info_df"]["stratum"] = pd.cut( + biology_data["trawl_info_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, ) + +def define_boundary_box(boundary_dict: dict, projection: str): + + # Get x-coordinates + if "longitude" in boundary_dict.keys(): + x = np.array(boundary_dict["longitude"]) else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "acoustics.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - zarr_files_str = [str(file) for file in zarr_files] - # ---- Create DataFrame - current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `zarr_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in zarr_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Find new files that have not yet been processed - if not new_files: - subset_files = zarr_files + x = np.array(boundary_dict["northings"]) + + # Get y-coordinates + if "latitude" in boundary_dict.keys(): + y = np.array(boundary_dict["latitude"]) else: - subset_files = set(zarr_files).intersection(set(new_files)) + y = np.array(boundary_dict["eastings"]) - # Read in the `*.zarr` file(s) - # ! 
[REQUIRES DASK] ---- Read in the listed file - if len(subset_files) > 1: - zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - elif len(subset_files) == 1: - zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") + # Create a boundary DataFrame + bound_df = pd.DataFrame({ + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + }) - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + # Convert to a GeoDataFrame and return the GeoDataFrame + return gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), + crs=projection, ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - # Extract defined acoustic frequency - # ---- From the configuration - transmit_settings = acoustic_analysis_settings["transmit"] - # ---- Transform `frequency_nominal`, if necessary - zarr_data_df_filtered["frequency_nominal"] = ( - configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], - transmit_settings, - zarr_data_ds["frequency_nominal"].units) - ) - # ---- Filter out any unused frequency coordinates - zarr_data_df_output = ( - zarr_data_df_filtered - [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) - # ---- Drop frequency column and return the output - acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) - # ---- Return output - if update_config: - if file_configuration["database"]["acoustics"] is None: - file_configuration["database"]["acoustics"] = db_file - return acoustics_output, file_configuration + +def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the griddification definitions + griddify_definitions = spatial_config["griddify"] + + # Get the projection definition + projection = spatial_config["projection"] + + # Compute the boundary box GeoDataFrame + boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + + # Convert the coordinates, if needed + if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): + # ---- Compute the equivalent UTM string + utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), + np.median(boundary_box.loc[0:3, "y"]))) + # ---- Compute the boundary box GeoDataFrame with the new projection + boundary_box = boundary_box.to_crs(utm_num) + # ---- Create a new projection for later + projection_new = f"epsg:{utm_num}" else: - return acoustics_output \ No newline at end of file + 
projection_new = projection + + # Define the step sizes + # ---- Define x step size + x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters + # ---- Define y step size + y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + + # Get the boundary tuple + xmin, ymin, xmax, ymax = boundary_box.total_bounds + + # Generate the cells + grid_cells = [] + # ---- Iterate through + for y0 in np.arange(ymin, ymax+y_step, y_step): + for x0 in np.arange(xmin, xmax+x_step, x_step): + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + + # Get the centroids + cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + + # + prc_nasc_df = acoustic_data["prc_nasc_df"] + + # to GDF + prc_nasc_gdf = gpd.GeoDataFrame( + data=prc_nasc_df, + geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), + crs=projection, + ) + # to UTM + prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + + prc_nasc_new["x"] = prc_nasc_new["geometry"].x + prc_nasc_new["y"] = prc_nasc_new["geometry"].y + + # ---- Bin the latitude data + prc_nasc_new["stratum_x"] = pd.cut( + prc_nasc_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + prc_nasc_new["stratum_y"] = pd.cut( + prc_nasc_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + acoustic_data["prc_nasc_df"]["stratum"] = ( + prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) + ) + + if not biology_data["trawl_info_df"].empty: + + # + trawl_info_df = biology_data["trawl_info_df"] + + # to GDF + trawl_info_gdf = gpd.GeoDataFrame( + data=trawl_info_df, + geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), + crs=projection, + ) + # to UTM + trawl_info_new = trawl_info_gdf.to_crs(projection_new) + + trawl_info_new["x"] = trawl_info_new["geometry"].x + trawl_info_new["y"] = trawl_info_new["geometry"].y + + # ---- Bin the latitude data + trawl_info_new["stratum_x"] = pd.cut( + trawl_info_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + trawl_info_new["stratum_y"] = pd.cut( + trawl_info_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + biology_data["trawl_info_df"]["stratum"] = ( + trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) + ) + diff --git a/echopop/live/livesurvey.py b/echopop/live/live_survey.py similarity index 78% rename from echopop/live/livesurvey.py rename to echopop/live/live_survey.py index 6d6a8621..e8c60da5 100644 --- a/echopop/live/livesurvey.py +++ b/echopop/live/live_survey.py @@ -3,10 +3,8 @@ import copy import yaml -from .livecore import( +from .live_core import( LIVE_DATA_STRUCTURE, - LIVE_FILE_FORMAT_MAP, - LIVE_INPUT_FILE_CONFIG_MAP ) from ..acoustics import ( @@ -37,7 +35,11 @@ def __init__( # initialize the Survey class object self.config = 
eldp.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - + # ---- Initialize config key for database files + self.config.update( + {"database": {key: None for key in self.config["input_directories"].keys()}} + ) + # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) @@ -50,11 +52,9 @@ def __init__( # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict # Ingest data # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"], self.config = eldp.load_acoustic_data(self.config, - update_config) + self.input["acoustics"]["prc_nasc_df"] = eldp.load_acoustic_data(self.config) # ---- Biology - self.input["biology"], self.config = eldp.load_biology_data(self.config, - update_config) + self.input["biology"] = eldp.load_biology_data(self.config) # TODO: Add verbosity for printing database filepaths/connections if verbose: diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index e8d8de93..4b282e13 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -3,7 +3,7 @@ import pandas as pd from typing import Optional -def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, +def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): """ Generate a SQL command to create a table with dynamic columns, primary keys, and indices. @@ -18,8 +18,8 @@ def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, """ # Generate column definitions column_definitions = ( - ",\n".join(f"{col} {SQL_DTYPES[type(col).__name__]}" - for col in df.columns) + ",\n".join(f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" + for col in dataframe.columns) ) # Generate primary key definition @@ -38,6 +38,12 @@ def sql_create(connection: sqla.Connection, df: pd.DataFrame, table_name: str, # Execute connection.execute(text(create_table_command.strip())) +def sql_map_tables(connection: sqla.Connection): + """ + """ + inspector = inspect(connection) + return inspector.get_table_names() + def sql_validate(connection: sqla.Connection, table_name: str): """ Check if a table exists in the database. @@ -52,7 +58,7 @@ def sql_validate(connection: sqla.Connection, table_name: str): inspector = inspect(connection) return table_name in inspector.get_table_names() -def sql_inspect(connection: sqla.Connection): +def sql_inspect(connection: sqla.Connection, table_name: str): """ Get a list of all tables present @@ -62,7 +68,16 @@ def sql_inspect(connection: sqla.Connection): Returns: list: True if the table exists, False otherwise. 
""" - return inspect(connection).get_table_names() + + # Create 'inspector' for the db file + inspector = inspect(connection) + + # Retrieve column information + column_info = inspector.get_columns(table_name) + + # Format as a dictionary + return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} + def sql_drop(connection: sqla.Connection, table_name: str): """ @@ -83,26 +98,42 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data """ # Prepare the SQL statement for insertion + # ---- Check whether `columns` is '*' + if "*" in columns: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + columns = [col['name'] for col in inspector.get_columns(table_name)] # ---- If not a List - if not isinstance(columns, list): - columns = list(columns) - + elif not isinstance(columns, list): + columns = [columns] + # ---- Prepare the columns as a string of column names column_names = ", ".join(columns) + + # Format `id_columns` + if id_columns is not None and not isinstance(id_columns, list): + id_columns = [id_columns] # Convert the DataFrame into a tuple and then into a string + # ---- Replace NaN with None + dataframe = dataframe.replace([np.nan], [None]) + # ---- Identify any possible DATETIME columns + # datetime_columns = ( + # {col["name"]: str for col in columns_info + # if isinstance(col["type"], sqla.sql.sqltypes.DATETIME)} + # ) + # ---- Encapsulate datetimes with quotes by converting to string + # dataframe = dataframe.astype(datetime_columns) # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] # ---- Tuple to String - if dataframe.columns.size == 1: - data_str = ", ".join( - f"{', '.join(map(str, row))}" - for row in data_tuple - ) - else: - data_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuple - ) + data_str = ", ".join( + # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" + f"({', '.join(map(lambda x: f'\'{x}\'' + if isinstance(x, str) or isinstance(x, pd.Timestamp) + else 'NULL' if x is None else str(x), row))})" + for row in data_tuple + ) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -115,57 +146,114 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Construct the SQL query sql_command = f""" INSERT INTO {table_name} ({column_names}) - VALUES ({data_str}) + VALUES {data_str} {on_conflict_clause} """ - + # Execute connection.execute(text(sql_command.strip())) # Commit connection.commit() - +from typing import Literal +import numpy as np +def sql_select(connection: sqla.Connection, table_name: str, columns: list, + output_type: type = pd.DataFrame): + + # Prepare the columns as a string of column names + column_names = ", ".join(columns) + + # Format the SQL command + sql_command = f"SELECT {column_names} FROM {table_name};" + + # Execute the command + table = connection.execute(text(sql_command)) + + # Fetch the data from the table + data = table.fetchall() + # Inspect the table to construct a dictionary of expected datatypes for each column + table_info = sql_inspect(connection, table_name=table_name) + # ---- Whittle down the information dictionary to isolate just the column datatypes + table_dtypes = {col: info['type'] for col, info in table_info.items()} + + # Raise error if `output_type` is invalid + if output_type not in [pd.DataFrame, np.ndarray, str, tuple]: + raise 
TypeError( + f"Argument `output_type` ([{output_type}]) must be either `str`, `tuple`, " + f"`pandas.DataFrame`, or `numpy.ndarray`." + ) + + # Format the output + # ---- DataFrame + if output_type is pd.DataFrame: + # ---- Create DataFrame + output_df = pd.DataFrame(data, columns=table.keys()) + # ---- Format the expected datatypes + df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] for col, dtype in table_dtypes.items()} + # ---- Apply the dtypes + return output_df.astype(df_dtypes) + else: + # ---- Get the datatypes that will correspond to each value of the tuples + tuple_dtypes = [SQL_DTYPES[type(dtype).__name__] for _, dtype in table_dtypes.items()] + # ---- Convert the `Row` objects to tuples + converted_data = [ + tuple(dtype(value) if value is not None else None + for value, dtype in zip(row, tuple_dtypes)) + for row in data + ] + # ---- String + if output_type is str: + return [item[0] for item in converted_data] + # ---- Array + elif output_type is np.ndarray: + return np.array([item[0] for item in converted_data]) + # ---- Tuple + else: + return converted_data + SQL_COMMANDS = { - "create": sql_create, - "drop": sql_drop, - "inspect": sql_inspect, - "validate": sql_validate, - + "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), + "drop": dict(function=sql_drop, args=["table_name"]), + "insert": dict(function=sql_insert, args=["table_name", "columns", "dataframe", "id_columns"]), + "inspect": dict(function=sql_inspect, args=["table_name"]), + "map": dict(function=sql_map_tables, args=[]), + "select": dict(function=sql_select, args=["table_name", "columns", "output_type"]), + "validate": dict(function=sql_validate, args=["table_name"]), +} - "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", - "drop": "DROP TABLE IF EXISTS {table_name};", - "select": "SELECT {columns} FROM {table_name};", - "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", - # "insert": "INSERT INTO {table_name} ({columns});", - "insert": """ - INSERT INTO {table_name} ({columns}) - SELECT {columns} - FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) - {filter_clause}; - """, - "inspect": None, -} - SQL_DTYPES = { 'int32': 'INTEGER', 'int64': 'INTEGER', 'float64': 'FLOAT', + "float": "FLOAT", + "int": "INTEGER", 'bool': 'BOOLEAN', - 'datetime64[ns]': 'DATETIME', + "Timestamp": "DATETIME", 'object': 'TEXT', "str": "TEXT", -} + "FLOAT": float, + "INTEGER": int, + "DATETIME": str, + "TEXT": str, +} def format_sql_columns(kwargs: dict): + + # Columns if "columns" in kwargs: - if isinstance(kwargs["columns"], list): + if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) else: kwargs["columns"] = "*" - + + # ID/Conflict columns + if "id_columns" in kwargs: + if isinstance(kwargs["id_columns"], list) or isinstance(kwargs["id_columns"], pd.Index): + kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) + # Return the updated `kwargs` dictionary return kwargs @@ -184,31 +272,39 @@ def SQL(db_file: str, command: str, **kwargs): # Run the command try: with engine.connect() as connection: - # ---- SELECT - if command == "select": - return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # ---- REPLACE - elif command == "replace": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Replace current - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, 
- if_exists="replace", index=False) - - # ---- INSERT - elif command == "insert": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Insert into the table - df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", - index=False) - # ---- INSPECT - elif command == "inspect": - return inspect(engine).get_table_names() - # ---- OTHER COMMAND - else: - connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) + # ---- Get the function name + command_function = SQL_COMMANDS[command]["function"] + # ---- Get the function arguments + command_args = SQL_COMMANDS[command]["args"] + # ---- Drop unnecessary keys (update `kwargs`) + kwargs = {key: value for key, value in kwargs.items() if key in command_args} + # ---- Return output + return command_function(connection, **kwargs) + # # ---- SELECT + # if command == "select": + # return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) + # # ---- REPLACE + # elif command == "replace": + # # ---- Extract dataframe + # df_to_add = kwargs["dataframe"] + # # ---- Replace current + # df_to_add.to_sql(name=kwargs["table_name"], + # con=connection, + # if_exists="replace", index=False) + + # # ---- INSERT + # elif command == "insert": + # # ---- Extract dataframe + # df_to_add = kwargs["dataframe"] + # # ---- Insert into the table + # df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", + # index=False) + # # ---- INSPECT + # elif command == "inspect": + # return inspect(engine).get_table_names() + # # ---- OTHER COMMAND + # else: + # connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 077d9c93..bb78e1ba 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -12,8 +12,109 @@ # Create the grid points grid_points = [(i, j, 0) for i in x for j in y] +def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + acoustic_files = validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files = query_acoustic_db_files(file_configuration, acoustic_files) + + # Read in the acoustic data files + # ! 
[REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + + # Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + + # Return output + return prc_nasc_df_processed + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Iterate through each of the file ids and read in the data + for id in list(biology_file_ids.keys()): + # ---- Extract the specific config mapping for this tag/id + sub_config_map = biology_config_map[id] + # ---- Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subcsv_files_str = [str(file) for file in list(subfile_path_obj)] + # ---- Filter for only new files + subset_files = set(subcsv_files_str).intersection(set(new_files)) + # ---- Pull from SQL database, if applicable + if f"{id}_df" in tables: + # ---- SELECT + sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") + # ---- Concatenate to the dictionary + sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) + # ---- Add data files not stored in SQL database + if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: + if len(subset_files) > 0: + file_list = subset_files + else: + file_list = subcsv_files_str + # ---- Create a list of relevant dataframes + sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) + for file in file_list] + # ---- Concatenate into a single DataFrame + sub_df = pd.concat(sub_df_lst, ignore_index=True) + # ---- Lower-case sex + if "sex" in sub_df.columns: + sub_df["sex"] = sub_df["sex"].str.lower() + # ---- Concatenate to the dictionary DataFrame + biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = biology_analysis_settings["catch"]["partition"] + # ---- Apply the filter + filtered_biology_output = { + key: df[ + (df['species_id'] == species_filter if 'species_id' in df.columns else True) & + (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) + ] + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Update + _ = SQL(db_file, "insert", table_name=table_name, columns="*", + dataframe=df) + + # Combine the two datasets + merged_output = { + key: pd.concat([ + sql_biology_output.get(key, pd.DataFrame()), + filtered_biology_output.get(key, pd.DataFrame()) + ]).drop_duplicates().reset_index(drop=True) + for key in set(sql_biology_output) | set(filtered_biology_output) + } + # ---- Return output + if update_config: + if file_configuration["database"]["biology"] is None: + file_configuration["database"]["biology"] = db_file + return merged_output, file_configuration + else: + return 
merged_output + + -# data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") db_directory = data_root_dir / "database" # ---- Create the directory if it does not already exist @@ -118,7 +219,7 @@ def create_table_sql(table_name, columns, primary_keys=None, index_columns=None) for row in rows: print(row) - +converted_data[0] check_table_exists(engine, "files_read") zarr_files_str = ["A", "B", "C", "D"] @@ -247,11 +348,13 @@ def update_specific_rows(engine, table_name, updates, conditions): condition_columns = ['x', 'y'] # Define the updates and conditions -dd = {"x": np.array([1, 2, 3 , 4, 5]), "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} +dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} new_data = pd.DataFrame(dd) new_data df = new_data +kwargs = {"table_name": "grid", "columns": df.columns, "df": df} + with engine.connect() as connection: # sql_create(connection, table_name = "grid", df = df) # sql_validate(connection, "grid") @@ -1911,6 +2014,8 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # Return the resulting DataFrame return df_validated +boundary_dict = griddify_definitions["bounds"] + ## grid_settings["grid_resolution"]["x"] = 50 grid_settings["grid_resolution"]["y"] = 50 @@ -1928,7 +2033,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), crs = projection ) - +from echopop.spatial.projection import utm_string_generator utm_string_generator(-117.0, 33.75) bound_gdf.total_bounds # Convert to UTM diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 44a83ab4..c01445b3 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -13,49 +13,9 @@ import re import contextlib from sqlalchemy import create_engine, text, Engine, inspect +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live import live_data_processing as eldp -#################################################################################################### -# * Functionality for a) loading YAML configuration file, b) search defined directory for -# * input files, c) ingest *.zarr/*.csv -# TODO: Incorporate complete YAML file validator -# TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - - # Validate file existence - # ---- str-to-Path conversion, if necessary - live_init_config_path = Path(live_init_config_path) - live_file_config_path = Path(live_file_config_path) - # ---- Create list of both config paths - config_files = [live_init_config_path, live_file_config_path] - # ---- List of file existence checks - config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] - # ---- Error evaluation and print message (if applicable) - if not all(config_existence): - missing_config = [ - files for files, exists in zip(config_files, config_existence) if not exists - ] - raise FileNotFoundError(f"The following configuration files do not exist: {missing_config}") - - # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class - # ---- Initialization settings - init_config = yaml.safe_load(Path(live_init_config_path).read_text()) - # ---- Filepath/directory settings - file_config = 
yaml.safe_load(Path(live_file_config_path).read_text()) - - # Check for intersecting/duplicative configuration keys - # ---- Compare sets of keys from each dictionary - config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) - # ---- Raise error if needed - if config_intersect: - raise ValueError( - f"The initialization and file configuration files comprise the following intersecting " - f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " - f"file." - ) - - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object - return {**init_config, **file_config} #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths @@ -66,460 +26,394 @@ def live_configuration(live_init_config_path: Union[str, Path], file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling + + + + + +#################################################################################################### +# * Functionality for reading in processed acoustic data +# TODO: Expand data validator and limit cases to '*.zarr' (for now) +# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change +file_settings = file_configuration["input_directories"]["acoustics"] +root_directory = file_configuration["data_root_dir"] + + +#################################################################################################### +# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION +# NOTE: +# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +acoustic_data = load_acoustic_data(file_configuration) +acoustic_data +file_configuration["database"] + +def estimate_echometrics(acoustic_data_df: pd.DataFrame): + + # Create copy + acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + + # Pre-compute the change in depth + acoustic_df["dz"] = acoustic_df["depth"].diff() + + # Initialize echometrics dictionary + echometrics = {} + + # Compute the metrics center-of-mass + if acoustic_df["NASC"].sum() == 0.0: + echometrics.update({ + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation": np.nan, + "occupied_area": 0.0, + }) else: - return frequency_values + + # Compute the number of layers + echometrics.update({ + "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size + }) + + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # ---- Estimate mean Sv + echometrics.update({ + "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + }) + # --- 
Estimate max Sv (i.e. ) + echometrics.update({ + "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + }) + + # Compute (acoustic) abundance + echometrics.update({ + "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) + }) + + # Compute center of mass + echometrics.update({ + "center_of_mass": ( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() + / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the dispersion + echometrics.update({ + "dispersion": ( + ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ) + }) + + # Compute the evenness + echometrics.update({ + "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + }) + + # Compute the index of aggregation + echometrics.update({ + "aggregation": 1 / echometrics["evenness"] + }) + + # Get the occupied area + echometrics.update({ + "occupied_area": ( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ) + }) + + # Return the dictionary + return echometrics + +def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Vertically integrate PRC NASC + nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + + # Horizontally concatenate `echometrics`, if `True` + if echometrics: + # ---- Compute values + # NOTE: This uses NASC instead of linear `sv` + echometrics_dict = estimate_echometrics(acoustic_data_df) + # ---- Merge + nasc_dict.update(echometrics_dict) + + # Convert `nasc_dict` to a DataFrame and return the output + return pd.Series(nasc_dict) + + +acoustic_data_df = acoustic_data["prc_nasc_df"] + + + +# SQL(database_file, "drop", table_name="nasc_df") +# SQL(database_file, "validate", **kwargs) +# SQL(database_file, "create", table_name="nasc_df", primary_keys=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "validate", **kwargs) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) +# SQL(database_file, "select", table_name="nasc_df") +# SQL(database_file, "insert", table_name="nasc_df", dataframe=nasc_data_df) +# SQL(database_file, "drop", table_name="nasc_df") +# SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] + +def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation": float, "occupied_area": float}) + ) + + # Get the acoustics database file + acoustics_db = file_configuration["database"]["acoustics"] + + # Insert the new data into the database and pull in the combined previous and new data combined + full_nasc_df = sql_data_exchange(acoustics_db, dataframe=nasc_data_df, + table_name="nasc_df", + 
id_columns=["longitude", "latitude", "ping_time"], + primary_keys=["longitude", "latitude", "ping_time"], + output_type=pd.DataFrame) + + # Return the output + return full_nasc_df + #################################################################################################### -# * Define `LIVE_INPUT_FILE_CONFIG_MAP` configuration mapping (this will be in an equivalent -# * `core.py`) -# TODO: Update structure with additional information (as needed) -# TODO: Documentation -LIVE_INPUT_FILE_CONFIG_MAP = { - "acoustics": { - "xarray_coordinates": { - "distance": float, - "depth": float, +def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + + # Get all database files + database_files = file_configuration["database"] + + # Iterate through all keys + for _, db_file in database_files.items(): + # ---- Map the table names + table_names = SQL(db_file, "map") + # ---- Drop any noted exceptions + if not isinstance(table_exception, list): + table_exception = [table_exception] + # ---- Drop exception table name + if None not in table_exception: + table_names = list(set(table_names) - set(table_exception)) + _ = [SQL(db_file, "drop", table_name=table) for table in table_names] + # ---- Validate that all tables were removed + if set(table_names).intersection(set(SQL(table_names, "map"))): + raise ValueError( + f"Attempted reset of [{str(db_file)}] failed." + ) + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], }, - "xarray_variables": { - "NASC": float, - "frequency_nominal": float, - "latitude": float, - "longitude": float, - "ping_time": "datetime64[ns]", - } }, - "biology": { - "catch": { - "dtypes": { - "partition": str, - "species_code": int, - "sample_weight_kg": float, - "catch_perc": float, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] }, - "names": { - "partition": "trawl_partition", - "species_code": "species_id", - "sample_weight_kg": "haul_weight", - "catch_perc": "catch_percentage", - } - }, - "length": { - "dtypes": { - "sex": str, - "rounded_length": int, - "frequency": int, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] }, - "names": { - "sex": "sex", - "rounded_length": "length", - "frequency": "length_count", - } + "pairs": [("longitude", "latitude"), ("northings", "eastings")], }, - "specimen": { - "dtypes": { - "rounded_length": int, - "organism_weight": float, - "sex": str, + "grid_resolution": { + "x_distance": { + "types": float, }, - "names": { - "sex": "sex", - "rounded_length": "length", - "organism_weight": "weight" + "y_distance": { + "types": float, }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], }, }, -} - -LIVE_FILE_FORMAT_MAP = { - "DATE:YYYYMM": { - "name": "date", - "dtype": "datetime[ns]", - "expression": r"(?P\d{6})", - }, - "DATE:YYYYMMDD": { - "name": "date", - "dtype": "datetime[ns]", - "expression": r"(?P\d{8})", - }, - "HAUL": { - "name": "haul_num", - "dtype": int, - "expression": r"(?P\d+)", - }, - "SPECIES_CODE": { - "name": "species_id", - "dtype": int, - "expression": r"(?P\d+)" + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, }, - "FILE_ID": { - "name": "file_id", - "dtype": 
str, - "expression": r"(?P.+)" + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, }, } -def compile_filename_format(file_name_format: str): - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - # Compile the regex pattern and return the output - return re.compile(regex_pattern) +reset_db_files(file_configuration, table_exception = "files_read") +reset_db_files(file_configuration) -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) +stamp = 20240714194248 +stamp.astype(int) +int(stamp) +import re +from datetime import datetime - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" - ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) - # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) - - # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - # ---- Create sub-list of columns that can be added to the DataFrame - valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - - # Compile the filename regular expression - compiled_regex = compile_filename_format(pattern) - # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) - - # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - # Return the resulting DataFrame - return df_validated -#################################################################################################### -# * Functionality for reading in processed acoustic data -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
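# A minimal, standalone sketch of the filename-tag substitution performed by
# `compile_filename_format` above: tags such as {DATE:YYYYMMDD} and {HAUL} are swapped for
# named regex groups so that metadata can be parsed straight from a file name. The format
# map, pattern, and file name below are hypothetical illustrations, not survey files.
import re

EXAMPLE_FORMAT_MAP = {
    "DATE:YYYYMMDD": r"(?P<date>\d{8})",
    "HAUL": r"(?P<haul_num>\d+)",
}

def example_template_to_regex(template: str) -> re.Pattern:
    # Swap each {TAG} for its named-group expression, then compile the result
    for tag, expression in EXAMPLE_FORMAT_MAP.items():
        template = template.replace(f"{{{tag}}}", expression)
    return re.compile(template)

example_pattern = example_template_to_regex(r"{DATE:YYYYMMDD}_{HAUL}_lf")
example_match = example_pattern.search("20190721_017_lf.csv")
if example_match:
    # Named groups expose the parsed metadata: '20190721' and haul number 17
    print(example_match.group("date"), int(example_match.group("haul_num")))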
-# TODO: Documentation -def load_acoustic_data(file_configuration: dict, update_config: bool = True) -> Tuple[pd.DataFrame, xr.Dataset]: - # Get acoustic directory and initialization settings - # ---- Files - acoustic_file_settings = file_configuration["input_directories"]["acoustic"] - # ---- General settings - acoustic_analysis_settings = file_configuration["acoustics"] +def infer_datetime_format(timestamp_str: Union[int, str]): + patterns = { + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS + r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + } - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - # ---- Initialize the dictionary that will define this key in the `input` attribute - acoustics_output = {"prc_nasc_df": pd.DataFrame(), - "nasc_df": pd.DataFrame()} - # ---- Initialize the SQL dictionary - # sql_acoustics_output = {"sv_df": pd.DataFrame()} - - # Create full filepath - acoustic_directory_path = ( - Path(file_configuration["data_root_dir"]) / acoustic_file_settings["directory"] - ) + for pattern, date_format in patterns.items(): + if re.match(pattern, timestamp_str): + return date_format - # Validate filepath, columns, datatypes - # ---- Directory check - directory_existence = acoustic_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{acoustic_directory_path}] does not exist." - ) - # ---- Get the defined file extension - file_extension = acoustic_file_settings["extension"] - # ---- Create Path.glob generator object (the case of a *.zarr file) - file_path_obj = acoustic_directory_path.glob(f"*{'.'+file_extension}") - # ---- Find all zarr files - zarr_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(zarr_files) < 1: - raise FileNotFoundError( - f"No `*.zarr` files found in [{acoustic_directory_path}]!" 
- ) + raise ValueError("Unknown timestamp format") + +filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + + + +df[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] +df[(df["species_id"] == 17 if "species_id" in df.columns)] + +(df[df["haul_num"] == 17 if "haul_num" in df.columns] else True) + + +from datetime import datetime + +df = biology_output["trawl_info_df"] +df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] +df.index + +biology_output["trawl_info_df"].reset_index().index +df = biology_output["catch_df"] +df = df.loc[0, :].to_frame().T +df.index +df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] + +def convert_datetime(timestamp: Union[int, str, pd.Series]): + + if isinstance(timestamp, pd.Series): + test_timestamp = str(timestamp[0]) else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "acoustics.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - zarr_files_str = [str(file) for file in zarr_files] - # ---- Create DataFrame - current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `zarr_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in zarr_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Find new files that have not yet been processed - if not new_files: - subset_files = zarr_files + test_timestamp = str(timestamp) + + # Approximate the datetime format + datetime_format = infer_datetime_format(str(test_timestamp)) + + # + if isinstance(timestamp, pd.Series): + return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) else: - subset_files = set(zarr_files).intersection(set(new_files)) - - # Read in the `*.zarr` file(s) - # ! 
[REQUIRES DASK] ---- Read in the listed file - if len(subset_files) > 1: - zarr_data_ds = xr.open_mfdataset(subset_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - elif len(subset_files) == 1: - zarr_data_ds = xr.open_dataset(subset_files[0], engine="zarr", chunks="auto") - - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Extract coordinate metadata - coordinate_metadata = zarr_data_ds[["longitude", "latitude"]] - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one *.{file_extension} file in " - f"[{acoustic_directory_path}]: {', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - - # Extract defined acoustic frequency - # ---- From the configuration - transmit_settings = acoustic_analysis_settings["transmit"] - # ---- Transform `frequency_nominal`, if necessary - zarr_data_df_filtered["frequency_nominal"] = ( - configure_transmit_frequency(zarr_data_df_filtered["frequency_nominal"], - transmit_settings, - zarr_data_ds["frequency_nominal"].units) - ) - # ---- Filter out any unused frequency coordinates - zarr_data_df_output = ( - zarr_data_df_filtered - [zarr_data_df_filtered["frequency_nominal"] == transmit_settings["frequency"]] - ) + return datetime.strptime(timestamp, datetime_format) - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - zarr_data_df_output.loc[:, "NASC"] = zarr_data_df_output.loc[:, "NASC"].fillna(0.0) - # ---- Drop frequency column and return the output - acoustics_output["prc_nasc_df"] = zarr_data_df_output.drop(columns = ["frequency_nominal"]) - # ---- Return output - if update_config: - if file_configuration["database"]["acoustics"] is None: - file_configuration["database"]["acoustics"] = db_file - return acoustics_output, file_configuration - else: - return acoustics_output -#################################################################################################### -# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data, file_configuration = load_acoustic_data(file_configuration) -acoustic_data -#################################################################################################### -def load_biology_data(file_configuration: dict, update_config: bool = True): - - # Get acoustic directory and initialization settings - # ---- Files - biology_file_settings = file_configuration["input_directories"]["biological"] - # ---- General settings - biology_analysis_settings = file_configuration["biology"] - - # Get the file-specific settings, datatypes, columns, etc. 
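# A minimal, standard-library sketch of the `files_read` bookkeeping pattern used by these
# data loaders: paths that have already been ingested are stored in a small SQLite table,
# and only paths absent from that table are processed on the next call. The table layout
# and function name here are illustrative assumptions, not the exact schema used by `SQL()`.
import sqlite3
from pathlib import Path
from typing import List

def example_new_files_only(db_path: Path, candidate_files: List[Path]) -> List[str]:
    candidates = [str(f) for f in candidate_files]
    with sqlite3.connect(db_path) as con:
        # Track previously read files in a single-column table keyed on the file path
        con.execute("CREATE TABLE IF NOT EXISTS files_read (filepath TEXT PRIMARY KEY)")
        already_read = {row[0] for row in con.execute("SELECT filepath FROM files_read")}
        unread = [f for f in candidates if f not in already_read]
        # Record the new paths so they are skipped on the next pass
        con.executemany(
            "INSERT OR IGNORE INTO files_read (filepath) VALUES (?)", [(f,) for f in unread]
        )
    return unread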
- # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = biology_file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Initialize the SQL dictionary - sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +infer_datetime_format(stamp) +convert_datetime(stamp) +infer_datetime_format(202407) + +# {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} +file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" +file_configuration["geospatial"] +spatial_config = file_configuration["geospatial"] +############### + +acoustic_data = self.input["acoustics"] +biology_data = self.input["biology"] + +def load_spatial_data(acoustic_data: dict, + biology_data: dict, + file_configuration: dict,): - # Create full filepath - biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Get (geo)spatial config + spatial_config = file_configuration["geospatial"] + # ---- Remove case sensitivity + spatial_config = {key.lower(): value for key, value in spatial_config.items()} + # ---- Extract the projection + projection = spatial_config["projection"] + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = spatial_config["link_biology_acoustics"] + + # Validate the configuration + validate_spatial_config(spatial_config) + + # Assign the spatial link constraints to the acoustic and biological data + if acoustics_biology_link == "INPFC": + apply_inpfc_definitions(acoustic_data, biology_data, spatial_config) + + + + # Convert the DataFrame to a GeoDataFrame + acoustic_data_gdf = gpd.GeoDataFrame( + data=acoustic_data, + geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), + crs=projection ) - # ---- Directory check - directory_existence = biology_directory_path.exists() - # ---- Error evaluation (if applicable) - if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." - ) - # ---- Get the defined file extension - file_extension = biology_file_settings["extension"] - # ---- Create Path.glob generator object - file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") - #---- Create list of `*.csv`` files - csv_files = list(file_path_obj) - # ---- Ensure files exist or raise error otherwise - if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
- ) - else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == 
species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output + # Validate the spatial biology-acoustics linking method + # ---- Get the biology-acoustics linking method + link_method = next(key for key, value in acoustics_biology_link.items() if value) + # ---- Flag Error if unexpected method + if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." + ) + #################################################################################################### # TEST: BIOLOGY FILE INGESTION CONFIGURATION # NOTE: From 7f49f316a7dcacca48940b076df8b75923250b96 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 25 Jul 2024 09:51:06 -0700 Subject: [PATCH 08/81] Reorganize loading functions --- echopop/live/live_core.py | 61 ++ echopop/live/live_data_loading.py | 612 +++++++++++++++++++ echopop/live/live_data_processing.py | 877 +-------------------------- echopop/live/live_spatial_methods.py | 198 ++++++ echopop/live/live_survey.py | 8 +- echopop/live/sql_methods.py | 137 +++-- echopop/zarr_read_ingest_test.py | 17 +- 7 files changed, 986 insertions(+), 924 deletions(-) create mode 100644 echopop/live/live_data_loading.py create mode 100644 echopop/live/live_spatial_methods.py diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 95750f5f..28a63237 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -131,3 +131,64 @@ "expression": r"(?P.+)" }, } + +SPATIAL_CONFIG_MAP = { + "closest_haul": { + "proximity": { + "choices": ["distance", "time"], + }, + }, + "global" : {}, + "griddify": { + "bounds": { + "longitude": { + "types": [float] + }, + "latitude": { + "types": [float] + }, + "northings": { + "types": [float] + }, + "eastings": { + "types": [float] + }, + "pairs": [("longitude", "latitude"), ("northings", "eastings")], + }, + "grid_resolution": { + "x_distance": { + "types": float, + }, + "y_distance": { + "types": float, + }, + "d_longitude": { + "types": float, + }, + "d_latitude": { + "types": float, + }, + "grid_size_x": { + "types": int, + }, + "grid_size_y": { + "types": int, + }, + "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y")], + }, + }, + "inpfc": { + "stratum_names": { + "types": [int, str] + }, + "latitude_max": { + "types": [float], + }, + }, + "weighted_haul": { + "proximity": { + "choices": ["distance", "time"] + }, + }, +} \ No newline at end of file diff --git a/echopop/live/live_data_loading.py 
b/echopop/live/live_data_loading.py new file mode 100644 index 00000000..ce5a06f7 --- /dev/null +++ b/echopop/live/live_data_loading.py @@ -0,0 +1,612 @@ +from pathlib import Path +from typing import Union, Tuple, Optional, List +import yaml +import re +from .sql_methods import SQL, query_processed_files, sql_data_exchange +import pandas as pd +from datetime import datetime +import xarray as xr + +from .live_core import( + LIVE_FILE_FORMAT_MAP, + LIVE_INPUT_FILE_CONFIG_MAP, + SPATIAL_CONFIG_MAP +) + +# TODO: Incorporate complete YAML file validator +# TODO: Documentation +def live_configuration(live_init_config_path: Union[str, Path], + live_file_config_path: Union[str, Path]): + + # Validate file existence + # ---- str-to-Path conversion, if necessary + live_init_config_path = Path(live_init_config_path) + live_file_config_path = Path(live_file_config_path) + # ---- Create list of both config paths + config_files = [live_init_config_path, live_file_config_path] + # ---- List of file existence checks + config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] + # ---- Error evaluation and print message (if applicable) + if not all(config_existence): + missing_config = [ + files for files, exists in zip(config_files, config_existence) if not exists + ] + raise FileNotFoundError( + f"The following configuration files do not exist: {missing_config}." + ) + + # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class + # ---- Initialization settings + init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # ---- Filepath/directory settings + file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + + # Check for intersecting/duplicative configuration keys + # ---- Compare sets of keys from each dictionary + config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) + # ---- Raise error if needed + if config_intersect: + raise ValueError( + f"The initialization and file configuration files comprise the following intersecting " + f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " + f"file." + ) + + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object + return {**init_config, **file_config} + +# TODO: Documentation +def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: + + # Get acoustic directory and initialization settings + # ---- Create the full filepath + directory_path = Path(root_directory) / file_settings["directory"] + # ---- Get the defined file extension + file_extension = file_settings["extension"] + + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_extension}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_extension}` files found in [{directory_path}]!" + ) + + # Return the output + return data_files + +def read_acoustic_zarr(acoustic_files: Path) -> tuple: + + # Get the file-specific settings, datatypes, columns, etc. 
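# A small, self-contained sketch of the merge rule enforced by `live_configuration` above:
# the initialization and file configurations must not share top-level keys, otherwise one
# file would silently overwrite the other. The dictionaries below are toy stand-ins for the
# parsed YAML contents, not values from an actual configuration.
example_init_config = {
    "acoustics": {"transmit": {"frequency": 38, "units": "kHz"}},
}
example_file_config = {
    "data_root_dir": "path/to/data",
    "input_directories": {"acoustics": {"directory": "acoustic/", "extension": "zarr"}},
}

example_shared = set(example_init_config).intersection(example_file_config)
if example_shared:
    raise ValueError(f"Duplicate configuration keys: {', '.join(sorted(example_shared))}")

# Safe to combine once the keys are known to be unique
example_config = {**example_init_config, **example_file_config}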
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] + # ---- Create list of coordinate data variables + specified_vars = list(acoustics_config_map["xarray_variables"].keys()) + # ---- Create set of coordinate variables + specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) + # ---- Concatenate into a full configuration map + full_config_map = {**acoustics_config_map["xarray_coordinates"], + **acoustics_config_map["xarray_variables"]} + + # Determine the file loading method for the `acoustic_files` + if len(acoustic_files) > 1: + zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", + data_vars=specified_vars, coords=specified_coords) + else: + zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + + # Pre-process the Dataset, convert it to a DataFrame, and validate the structure + # ---- Convert to a DataFrame + zarr_data_df = zarr_data_ds.to_dataframe().reset_index() + # ---- Check for any missing columns + missing_columns = ( + [key for key in full_config_map.keys() if key not in zarr_data_df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from at least one file: in " + f"{', '.join(missing_columns)}!" + ) + # ---- Select defined columns + zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + + # Gather some of the units + data_units = { + "longitude": zarr_data_ds.longitude.units, + "latitude": zarr_data_ds.latitude.units, + "frequency": zarr_data_ds.frequency_nominal.units, + } + + # Return a Tuple + return zarr_data_df_filtered, data_units + +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == "kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values + +# TODO: Documentation +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + +# TODO: Documentation +def load_acoustic_data(file_configuration: dict) -> 
Tuple[pd.DataFrame]: + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["acoustics"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + acoustic_files = validate_data_directory(root_directory, file_settings) + + # Query `acoustics.db` to process only new files (or create the db file in the first place) + new_acoustic_files, file_configuration["database"]["acoustics"] = ( + query_processed_files(root_directory, file_settings, acoustic_files) + ) + + # Read in the acoustic data files + if new_acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + file_configuration["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) + # ---- Return output + return prc_nasc_df_processed + else: + return None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] + + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def compile_filename_format(file_name_format: str): + + # Create a copy of `file_name_format` + regex_pattern = file_name_format + + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern + for key, value in LIVE_FILE_FORMAT_MAP.items(): + regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) + # ---- Replace the `FILE_ID` tag + regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + + # Compile the regex pattern and return the output + return re.compile(regex_pattern) + +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): + + # Read in the `*.csv` file + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) + + # Validate the dataframe + # ---- Check for any missing columns + missing_columns = ( + [key for key in config_map["dtypes"].keys() if key not in df.columns] + ) + # ---- Raise Error, if needed + if missing_columns: + raise ValueError( + f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
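# [Editor's sketch -- annotation, not a patch line] How a `file_name_formats` template becomes a
# regex via `compile_filename_format` above. `LIVE_FILE_FORMAT_MAP` is not shown in this diff, so
# the map below is a hypothetical stand-in with the same {"expression", "name", "dtype"} layout,
# and the template/filename are made up.
import re

HYPOTHETICAL_FORMAT_MAP = {
    "DATE": {"expression": r"(?P<DATE>\d{8})", "name": "date", "dtype": str},
    "HAUL": {"expression": r"(?P<HAUL>\d+)", "name": "haul_num", "dtype": int},
}

regex_pattern = "{DATE}_{HAUL}_{FILE_ID:catch}"
for key, value in HYPOTHETICAL_FORMAT_MAP.items():
    regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P<FILE_ID>\1)", regex_pattern)

match_obj = re.compile(regex_pattern).search("20190731_017_catch.csv")
haul_num = HYPOTHETICAL_FORMAT_MAP["HAUL"]["dtype"](match_obj.group("HAUL"))  # -> 17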
+ ) + # ---- Ensure the correct datatypes + df_validated = df.astype(config_map["dtypes"]) + # ---- Replace column names and drop + df_validated = df_validated.rename(columns=config_map["names"]) + + # Get the substring components that can be added to the DataFrame + filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + # ---- Create sub-list of columns that can be added to the DataFrame + valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) + + # Compile the filename regular expression + compiled_regex = compile_filename_format(pattern) + # ---- Create the `Match` object that will be used to parse the string + match_obj = compiled_regex.search(file.name) + + # Iterate through the filename-derived tags and add them to the DataFrame + for i in valid_tags: + matched_key = LIVE_FILE_FORMAT_MAP[i] + df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + # Return the resulting DataFrame + return df_validated + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + "latitude"]) + ) + + # Return a list of the output + return list(key_columns) + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + +def preprocess_biology_data(biology_output: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) + + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = "row" + 
df.index.astype(str) + "-" + "-".join(key_columns) + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return results_dict + +def infer_datetime_format(timestamp_str: Union[int, str]): + patterns = { + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS + r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + } + + for pattern, date_format in patterns.items(): + if re.match(pattern, timestamp_str): + return date_format + + raise ValueError("Unknown timestamp format") + +def convert_datetime(timestamp: Union[int, str, pd.Series]): + + if isinstance(timestamp, pd.Series): + test_timestamp = str(timestamp[0]) + else: + test_timestamp = str(timestamp) + + # Approximate the datetime format + datetime_format = infer_datetime_format(str(test_timestamp)) + + # + if isinstance(timestamp, pd.Series): + return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) + else: + return datetime.strptime(timestamp, datetime_format) + +def load_biology_data(file_configuration: dict): + + # Get the acoustic file settings and root directory + # ---- File settings + file_settings = file_configuration["input_directories"]["biology"] + # ---- Root directory + root_directory = file_configuration["data_root_dir"] + + # Get and validate the acoustic data directory and files + biology_files = validate_data_directory(root_directory, file_settings) + + # Query `biology.db` to process only new files (or create the db file in the first place) + # SQL(biology_db, "drop", table_name="files_read") + new_biology_files, file_configuration["database"]["biology"] = ( + query_processed_files(root_directory, file_settings, biology_files) + ) + + # Get the file-specific settings, datatypes, columns, etc. 
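# [Editor's sketch -- annotation, not a patch line] The timestamp handling that
# `infer_datetime_format`/`convert_datetime` above provide, reduced to stdlib calls with a
# made-up timestamp.
import re
from datetime import datetime

patterns = {
    r"^\d{14}$": "%Y%m%d%H%M%S",                                    # YYYYMMDDHHMMSS
    r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S",  # YYYY-MM-DD HH:MM:SS
}
stamp = "20240715084428"
fmt = next(f for p, f in patterns.items() if re.match(p, stamp))  # "%Y%m%d%H%M%S"
parsed = datetime.strptime(stamp, fmt)  # datetime(2024, 7, 15, 8, 44, 28)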
+ # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + ) + + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + file_settings["file_name_formats"][dataset], + new_biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Pre-process and return the results + return preprocess_biology_data(biology_output, file_configuration) + +def validate_hauls_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for `proximity` + if "proximity" not in input_method_settings.keys(): + raise KeyError( + "The following parameters are missing from the biology-acoustic linking method: " + "'proximity'!" + ) + + # Evaluate valid options for `proximity` + if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: + raise KeyError( + f"Value biology-acoustic linking method parameter `proximity` must be one of the : " + f"following: {link_method_settings['proximity']['choices']}." + ) + +def validate_griddify_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" 
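# [Editor's sketch -- annotation, not a patch line] The shape of spatial settings accepted by
# `validate_hauls_config` above (and the dispatching `validate_spatial_config` further below);
# the values are illustrative.
spatial_config = {
    "link_biology_acoustics": "closest_haul",
    "closest_haul": {"proximity": "distance"},  # must be "distance" or "time"
}
# validate_spatial_config(spatial_config) routes to validate_hauls_config() and returns without
# raising because "distance" is a permitted `proximity` choice.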
+ ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key] + # ---- Compare entries + parameter_diff = set(input.keys()).difference(set(model.keys())) + # ---- Raise Error + if parameter_diff: + raise KeyError( + f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " + f"configuration." + ) + # ---- Check if the appropriate coordinate pairs are present + coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] + # ---- Count the number of paired coordinates + pair_counts = [len(param) for param in coordinate_pairs] + # ---- If there are multiple pairs + if (np.array(pair_counts) == 2).sum() != 1: + raise ValueError( + f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " + f"for the link method '{link_method}' defined via the following options: " + f"{model['pairs']}." + ) + # ---- Check the datatypes + for parameter in input.keys(): + # ---- Get the datatypes + config_dtypes = model[parameter]["types"] + # ---- Get input parameter + input_parameter = input[parameter] + # ---- If List + if isinstance(config_dtypes, list): + if not isinstance(input_parameter, list): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be contained within a list." + ) + else: + input_parameter = [input_parameter] + config_dtypes = [config_dtypes] + # ---- Check correct datatypes + if not np.all([type(value) in config_dtypes for value in input_parameter]): + raise TypeError( + f"Biology-acoustic linking method argument '{parameter}' within '{key}' " + f"for method '{link_method}' must be one of the following types within a list: " + f"{config_dtypes}." + ) + +def validate_inpfc_config(spatial_config: dict, link_method: str): + + # Get the link method configuration map + link_method_settings = SPATIAL_CONFIG_MAP[link_method] + + # Extract the defined settings + input_method_settings = spatial_config[link_method] + + # Check for the required keys + key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) + # ---- Raise Error + if key_diff: + raise KeyError( + f"The following parameters are missing from the biology-acoustic linking method: " + f"{list(key_diff)}!" + ) + + # Iterate through the keys to evaluate inputs + for key in list(input_method_settings.keys()): + # ---- Subset the input method config + input = input_method_settings[key] + # ---- Get the original config of the dtypes + model = link_method_settings[key]["types"] + # ---- Evaluate if a list + if not isinstance(input, list): + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be contained within a list." + ) + # ---- Evaluate if it is a type within the list + if not type(input[0]) in model: + raise TypeError( + f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " + f"be one of the following types within a list: {model}." 
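# [Editor's sketch -- annotation, not a patch line] A `griddify` block that passes
# `validate_griddify_config` above: each parameter supplies exactly one coordinate pair, and
# list-typed entries are wrapped in lists. All numbers are illustrative.
griddify_config = {
    "link_biology_acoustics": "griddify",
    "projection": "epsg:4326",  # read by `apply_griddify_definitions` elsewhere in this patch
    "griddify": {
        "bounds": {
            "longitude": [-135.0, -120.0],
            "latitude": [34.0, 55.0],
        },
        "grid_resolution": {
            "x_distance": 25.0,  # nautical miles
            "y_distance": 25.0,
        },
    },
}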
+ ) + + +def validate_spatial_config(spatial_config: dict): + + # Check the link method + # ---- Extract string-formatted method name + link_method = spatial_config["link_biology_acoustics"].lower() + # ---- Validate + if link_method not in SPATIAL_CONFIG_MAP.keys(): + raise ValueError( + f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " + f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." + ) + + # Verify that associated parameters are present in the configuration settings + # ---- Get keys as a list + config_keys = list(spatial_config.keys()) + # ---- Check for specific methods + if link_method not in config_keys and link_method != "global": + raise ValueError( + f"No parameters provided for the biology-acoustic linking ([{link_method}])." + ) + + # Check key settings + if link_method == "griddify": + validate_griddify_config(spatial_config, link_method) + elif link_method == "inpfc": + validate_inpfc_config(spatial_config, link_method) + elif link_method != "global": + validate_hauls_config(spatial_config, link_method) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index fd89993c..cf126230 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -5,885 +5,10 @@ from typing import Union, Tuple, Optional, List import pandas as pd -import xarray as xr + import numpy as np from .live_core import( - LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP ) - -from .sql_methods import SQL - -# TODO: Incorporate complete YAML file validator -# TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - - # Validate file existence - # ---- str-to-Path conversion, if necessary - live_init_config_path = Path(live_init_config_path) - live_file_config_path = Path(live_file_config_path) - # ---- Create list of both config paths - config_files = [live_init_config_path, live_file_config_path] - # ---- List of file existence checks - config_existence = [live_init_config_path.exists(), live_file_config_path.exists()] - # ---- Error evaluation and print message (if applicable) - if not all(config_existence): - missing_config = [ - files for files, exists in zip(config_files, config_existence) if not exists - ] - raise FileNotFoundError( - f"The following configuration files do not exist: {missing_config}." - ) - - # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class - # ---- Initialization settings - init_config = yaml.safe_load(Path(live_init_config_path).read_text()) - # ---- Filepath/directory settings - file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - - # Check for intersecting/duplicative configuration keys - # ---- Compare sets of keys from each dictionary - config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) - # ---- Raise error if needed - if config_intersect: - raise ValueError( - f"The initialization and file configuration files comprise the following intersecting " - f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " - f"file." 
- ) - - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object - return {**init_config, **file_config} - -def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: - - # Get acoustic directory and initialization settings - # ---- Create the full filepath - directory_path = Path(root_directory) / file_settings["directory"] - # ---- Get the defined file extension - file_extension = file_settings["extension"] - - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_extension}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_extension}` files found in [{directory_path}]!" - ) - - # Return the output - return data_files - -def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: - - # Get the database name - db_name = file_settings["database_name"] - - # Create filepath to the SQL database - # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to the database file - db_file = db_directory / db_name - - # Create a list of string-formatted Path names - files_str = [str(file) for file in files] - # ---- Create DataFrame - current_files = pd.DataFrame(files_str, columns=["filepath"]) - - # Check for the table `files_read` - files_read_tbl = SQL(db_file, "validate", table_name="files_read") - - # Validate whether the table exists; if not, create the table and then insert - if not files_read_tbl: - # ---- Create table - SQL(db_file, "create", table_name="files_read", dataframe=current_files, - primary_keys = ["filepath"]) - # ---- Populate table - SQL(db_file, "insert", table_name="files_read", dataframe=current_files) - # ---- Break early - return files_str, db_file - - # Query already existing files - previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) - # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") - - # Filter out previously processed files - # ---- Apply filter by comparing sets and return the output - return list(set(files_str) - set(previous_files)), db_file - -def sql_data_exchange(database_file: Path, **kwargs): - - # Check whether the `table_name` table exists - table_exists = SQL(database_file, "validate", **kwargs) - - # If empty and table does not exist - if kwargs["dataframe"].empty and table_exists: - return SQL(database_file, "select", **kwargs) - - # Create table if it does not exist and run the initial insertion - if not table_exists: - # ---- Create table - SQL(database_file, "create", **kwargs) - # ---- Ignore the `id_columns` argument, if present - try: - del kwargs["id_columns"] - except KeyError: - pass - # ---- Insert into table - SQL(database_file, "insert", **kwargs) - # ---- Return the initial dataframe - return kwargs.get("dataframe") - - # Insert into the table - SQL(database_file, "insert", **kwargs) - - # Select existing data frame the database and return the output - return SQL(database_file, "select", **kwargs) - -def read_acoustic_zarr(acoustic_files: 
Path) -> tuple: - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} - - # Determine the file loading method for the `acoustic_files` - if len(acoustic_files) > 1: - zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - else: - zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") - - # Pre-process the Dataset, convert it to a DataFrame, and validate the structure - # ---- Convert to a DataFrame - zarr_data_df = zarr_data_ds.to_dataframe().reset_index() - # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from at least one file: in " - f"{', '.join(missing_columns)}!" - ) - # ---- Select defined columns - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) - - # Gather some of the units - data_units = { - "longitude": zarr_data_ds.longitude.units, - "latitude": zarr_data_ds.latitude.units, - "frequency": zarr_data_ds.frequency_nominal.units, - } - - # Return a Tuple - return zarr_data_df_filtered, data_units - -# TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change - else: - return frequency_values - -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, - file_configuration: dict) -> pd.DataFrame: - - # Get acoustic processing settings - acoustic_analysis_settings = file_configuration["acoustics"] - # ---- Extract the fined acoustic frequency - transmit_settings = acoustic_analysis_settings["transmit"] - - # Filter the dataset - # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) - ) - # ---- Filter out any unused frequency coordinates - prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) - -def 
load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files, file_configuration["database"]["acoustics"] = ( - query_processed_files(root_directory, file_settings, acoustic_files) - ) - - # Read in the acoustic data files - if new_acoustic_files: - # ! [REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - # ---- Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - # ---- Return output - return prc_nasc_df_processed - else: - return None - -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): - - # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subfile_str = [str(file) for file in list(subfile_path_obj)] - - # Convert list of proposed files from Path to String - file_str = [str(file) for file in list(files)] - - # Find intersection with the proposed filenames and return the output - return list(set(subfile_str).intersection(set(file_str))) - -def compile_filename_format(file_name_format: str): - - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - - # Compile the regex pattern and return the output - return re.compile(regex_pattern) - -def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) - - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_map["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
- ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_map["dtypes"]) - # ---- Replace column names and drop - df_validated = df_validated.rename(columns=config_map["names"]) - - # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - # ---- Create sub-list of columns that can be added to the DataFrame - valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - - # Compile the filename regular expression - compiled_regex = compile_filename_format(pattern) - # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) - - # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - # Return the resulting DataFrame - return df_validated - -def preprocess_biology_data(biology_output: dict, file_configuration: dict): - - # Get SQL database file - biology_db = file_configuration["database"]["biology"] - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = file_configuration["biology"]["catch"]["partition"] - # ---- Create filter dictionary - filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - - # Apply the filter - filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # ---- Swap this out if no new files are present - if not filtered_biology_output: - # ---- Get available tables - table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) - # ---- Plug into the dictionary - filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary - results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Get identifier columns - key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) - # ---- Create copy - df = df.copy() - # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint - df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) - # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - results_dict.update({table_name: table_df.drop(columns="id")}) - - # Return the output - return results_dict - -def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: - - # Get the data input column names - if data_dict[table_name].empty: - # ---- Inspect the table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of the data columns - table_columns = list(inspected_table.keys()) - else: - # ---- Get the DataFrame column names - table_columns = data_dict[table_name].columns - - # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - 
"latitude"]) - ) - - # Return a list of the output - return list(key_columns) - -def load_biology_data(file_configuration: dict): - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["biology"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - biology_files = validate_data_directory(root_directory, file_settings) - - # Query `biology.db` to process only new files (or create the db file in the first place) - # SQL(biology_db, "drop", table_name="files_read") - new_biology_files, file_configuration["database"]["biology"] = ( - query_processed_files(root_directory, file_settings, biology_files) - ) - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - - # Add SQL file to dict - file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] - ) - - # Iterate through the different biology datasets and read them in - for dataset in list(biology_file_ids.keys()): - # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - file_settings["file_name_formats"][dataset], - new_biology_files, - file_settings["extension"]) - # ---- If there are dataset files available - if dataset_files: - # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), - file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) - for file in dataset_files] - # ---- Concatenate the dataset - dataframe_combined = pd.concat(dataframe_list, ignore_index=True) - # ---- Lower-case sex - if "sex" in dataframe_combined.columns: - dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() - # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() - # ---- Reformat datetime column - if "datetime" in dataframe_combined.columns: - dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) - # ---- Add to the data dictionary - biology_output[f"{dataset}_df"] = dataframe_combined - - # Pre-process and return the results - return preprocess_biology_data(biology_output, file_configuration) - -SPATIAL_CONFIG_MAP = { - "closest_haul": { - "proximity": { - "choices": ["distance", "time"], - }, - }, - "global" : {}, - "griddify": { - "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, - "pairs": [("longitude", "latitude"), ("northings", "eastings")], - }, - "grid_resolution": { - "x_distance": { - "types": float, - }, - "y_distance": { - "types": float, - }, - "d_longitude": { - "types": float, - }, - "d_latitude": { - "types": 
float, - }, - "grid_size_x": { - "types": int, - }, - "grid_size_y": { - "types": int, - }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], - }, - }, - "inpfc": { - "stratum_names": { - "types": [int, str] - }, - "latitude_max": { - "types": [float], - }, - }, - "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, - }, -} - -def validate_spatial_config(spatial_config: dict): - - # Check the link method - # ---- Extract string-formatted method name - link_method = spatial_config["link_biology_acoustics"].lower() - # ---- Validate - if link_method not in SPATIAL_CONFIG_MAP.keys(): - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." - ) - - # Verify that associated parameters are present in the configuration settings - # ---- Get keys as a list - config_keys = list(spatial_config.keys()) - # ---- Check for specific methods - if link_method not in config_keys and link_method != "global": - raise ValueError( - f"No parameters provided for the biology-acoustic linking ([{link_method}])." - ) - - # Check key settings - if link_method == "griddify": - validate_griddify_config(spatial_config, link_method) - elif link_method == "inpfc": - validate_inpfc_config(spatial_config, link_method) - elif link_method != "global": - validate_hauls_config(spatial_config, link_method) - -def validate_hauls_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for `proximity` - if "proximity" not in input_method_settings.keys(): - raise KeyError( - "The following parameters are missing from the biology-acoustic linking method: " - "'proximity'!" - ) - - # Evaluate valid options for `proximity` - if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: - raise KeyError( - f"Value biology-acoustic linking method parameter `proximity` must be one of the : " - f"following: {link_method_settings["proximity"]["choices"]}." - ) - -def validate_griddify_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for the required keys - key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) - # ---- Raise Error - if key_diff: - raise KeyError( - f"The following parameters are missing from the biology-acoustic linking method: " - f"{list(key_diff)}!" - ) - - # Iterate through the keys to evaluate inputs - for key in list(input_method_settings.keys()): - # ---- Subset the input method config - input = input_method_settings[key] - # ---- Get the original config of the dtypes - model = link_method_settings[key] - # ---- Compare entries - parameter_diff = set(input.keys()).difference(set(model.keys())) - # ---- Raise Error - if parameter_diff: - raise KeyError( - f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " - f"configuration." 
- ) - # ---- Check if the appropriate coordinate pairs are present - coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] - # ---- Count the number of paired coordinates - pair_counts = [len(param) for param in coordinate_pairs] - # ---- If there are multiple pairs - if (np.array(pair_counts) == 2).sum() != 1: - raise ValueError( - f"A single coordinate-pair is allowed (and required) within the '{key}' parameter " - f"for the link method '{link_method}' defined via the following options: " - f"{model["pairs"]}." - ) - # ---- Check the datatypes - for parameter in input.keys(): - # ---- Get the datatypes - config_dtypes = model[parameter]["types"] - # ---- Get input parameter - input_parameter = input[parameter] - # ---- If List - if isinstance(config_dtypes, list): - if not isinstance(input_parameter, list): - raise TypeError( - f"Biology-acoustic linking method argument '{parameter}' within '{key}' " - f"for method '{link_method}' must be contained within a list." - ) - else: - input_parameter = [input_parameter] - config_dtypes = [config_dtypes] - # ---- Check correct datatypes - if not np.all([type(value) in config_dtypes for value in input_parameter]): - raise TypeError( - f"Biology-acoustic linking method argument '{parameter}' within '{key}' " - f"for method '{link_method}' must be one of the following types within a list: " - f"{config_dtypes}." - ) - -def validate_inpfc_config(spatial_config: dict, link_method: str): - - # Get the link method configuration map - link_method_settings = SPATIAL_CONFIG_MAP[link_method] - - # Extract the defined settings - input_method_settings = spatial_config[link_method] - - # Check for the required keys - key_diff = set(input_method_settings.keys()).difference(set(link_method_settings.keys())) - # ---- Raise Error - if key_diff: - raise KeyError( - f"The following parameters are missing from the biology-acoustic linking method: " - f"{list(key_diff)}!" - ) - - # Iterate through the keys to evaluate inputs - for key in list(input_method_settings.keys()): - # ---- Subset the input method config - input = input_method_settings[key] - # ---- Get the original config of the dtypes - model = link_method_settings[key]["types"] - # ---- Evaluate if a list - if not isinstance(input, list): - raise TypeError( - f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " - f"be contained within a list." - ) - # ---- Evaluate if it is a type within the list - if not type(input[0]) in model: - raise TypeError( - f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " - f"be one of the following types within a list: {model}." 
- ) - -def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - - # Extract the INPFC definitions - inpfc_definitions = spatial_config["inpfc"] - - # Create latitude bins - latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) - # ---- Append 1 more stratum layer - bin_names = np.concatenate([inpfc_definitions["stratum_names"], - [np.max(inpfc_definitions["stratum_names"]) + 1]]) - - # Create spatial key - spatial_config["spatial_key"] = pd.DataFrame({ - "latitude_limit": inpfc_definitions["latitude_max"], - }) - # ---- Cut - spatial_config["spatial_key"]["stratum"] = ( - pd.cut(inpfc_definitions["latitude_max"], - latitude_bins, - right = True, - labels = bin_names) - ) - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - # ---- Bin the latitude data - acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( - acoustic_data["prc_nasc_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - - # Get the `trawl_info_df` values, if they exist, and apply stratification information - if not biology_data["trawl_info_df"].empty: - # ---- Bin the latitude data - biology_data["trawl_info_df"]["stratum"] = pd.cut( - biology_data["trawl_info_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - -def define_boundary_box(boundary_dict: dict, projection: str): - - # Get x-coordinates - if "longitude" in boundary_dict.keys(): - x = np.array(boundary_dict["longitude"]) - else: - x = np.array(boundary_dict["northings"]) - - # Get y-coordinates - if "latitude" in boundary_dict.keys(): - y = np.array(boundary_dict["latitude"]) - else: - y = np.array(boundary_dict["eastings"]) - - # Create a boundary DataFrame - bound_df = pd.DataFrame({ - "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), - "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), - }) - - # Convert to a GeoDataFrame and return the GeoDataFrame - return gpd.GeoDataFrame( - data=bound_df, - geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), - crs=projection, - ) - - -def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - - # Extract the griddification definitions - griddify_definitions = spatial_config["griddify"] - - # Get the projection definition - projection = spatial_config["projection"] - - # Compute the boundary box GeoDataFrame - boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) - - # Convert the coordinates, if needed - if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): - # ---- Compute the equivalent UTM string - utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), - np.median(boundary_box.loc[0:3, "y"]))) - # ---- Compute the boundary box GeoDataFrame with the new projection - boundary_box = boundary_box.to_crs(utm_num) - # ---- Create a new projection for later - projection_new = f"epsg:{utm_num}" - else: - projection_new = projection - - # Define the step sizes - # ---- Define x step size - x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters - # ---- Define y step size - y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters - - # Get the boundary tuple - xmin, ymin, xmax, ymax = boundary_box.total_bounds - - # Generate the cells - grid_cells = [] - # ---- Iterate through - for y0 in np.arange(ymin, ymax+y_step, 
y_step): - for x0 in np.arange(xmin, xmax+x_step, x_step): - x1 = x0-x_step - y1 = y0+y_step - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - - # Convert to a GeoDataFrame - cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) - - # Get the centroids - cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - - # - prc_nasc_df = acoustic_data["prc_nasc_df"] - - # to GDF - prc_nasc_gdf = gpd.GeoDataFrame( - data=prc_nasc_df, - geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), - crs=projection, - ) - # to UTM - prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) - - prc_nasc_new["x"] = prc_nasc_new["geometry"].x - prc_nasc_new["y"] = prc_nasc_new["geometry"].y - - # ---- Bin the latitude data - prc_nasc_new["stratum_x"] = pd.cut( - prc_nasc_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - prc_nasc_new["stratum_y"] = pd.cut( - prc_nasc_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - acoustic_data["prc_nasc_df"]["stratum"] = ( - prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) - ) - - if not biology_data["trawl_info_df"].empty: - - # - trawl_info_df = biology_data["trawl_info_df"] - - # to GDF - trawl_info_gdf = gpd.GeoDataFrame( - data=trawl_info_df, - geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), - crs=projection, - ) - # to UTM - trawl_info_new = trawl_info_gdf.to_crs(projection_new) - - trawl_info_new["x"] = trawl_info_new["geometry"].x - trawl_info_new["y"] = trawl_info_new["geometry"].y - - # ---- Bin the latitude data - trawl_info_new["stratum_x"] = pd.cut( - trawl_info_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - trawl_info_new["stratum_y"] = pd.cut( - trawl_info_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - biology_data["trawl_info_df"]["stratum"] = ( - trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) - ) - diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py new file mode 100644 index 00000000..f38b130b --- /dev/null +++ b/echopop/live/live_spatial_methods.py @@ -0,0 +1,198 @@ +import geopandas as gpd +import pandas as pd +import numpy as np +from geopy.distance import distance +from ..spatial.projection import utm_string_generator +import shapely.geometry + +def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the INPFC definitions + inpfc_definitions = spatial_config["inpfc"] + + # Create latitude bins + latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) + # ---- Append 1 more stratum layer + bin_names = np.concatenate([inpfc_definitions["stratum_names"], + [np.max(inpfc_definitions["stratum_names"]) + 1]]) + + # Create spatial key + spatial_config["spatial_key"] = pd.DataFrame({ + "latitude_limit": inpfc_definitions["latitude_max"], + }) + # ---- Cut + spatial_config["spatial_key"]["stratum"] = ( + 
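# [Editor's sketch -- annotation, not a patch line] The latitude binning performed by
# `apply_inpfc_definitions`, reduced to a standalone pandas call; the stratum limits and
# latitudes are made-up examples.
import numpy as np
import pandas as pd

latitude_max = [42.0, 46.0, 50.0]
stratum_names = [1, 2, 3]
latitude_bins = np.concatenate([[-90.0], latitude_max, [90.0]])
bin_names = np.concatenate([stratum_names, [np.max(stratum_names) + 1]])  # extra northern stratum

latitudes = pd.Series([40.1, 45.2, 51.7])
strata = pd.cut(latitudes, latitude_bins, right=True, labels=bin_names)
print(strata.tolist())  # [1, 2, 4]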
pd.cut(inpfc_definitions["latitude_max"], + latitude_bins, + right = True, + labels = bin_names) + ) + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + # ---- Bin the latitude data + acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( + acoustic_data["prc_nasc_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, + ) + + # Get the `trawl_info_df` values, if they exist, and apply stratification information + if not biology_data["trawl_info_df"].empty: + # ---- Bin the latitude data + biology_data["trawl_info_df"]["stratum"] = pd.cut( + biology_data["trawl_info_df"]["latitude"], + latitude_bins, + right = True, + labels = bin_names, + ) + +def define_boundary_box(boundary_dict: dict, projection: str): + + # Get x-coordinates + if "longitude" in boundary_dict.keys(): + x = np.array(boundary_dict["longitude"]) + else: + x = np.array(boundary_dict["northings"]) + + # Get y-coordinates + if "latitude" in boundary_dict.keys(): + y = np.array(boundary_dict["latitude"]) + else: + y = np.array(boundary_dict["eastings"]) + + # Create a boundary DataFrame + bound_df = pd.DataFrame({ + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + }) + + # Convert to a GeoDataFrame and return the GeoDataFrame + return gpd.GeoDataFrame( + data=bound_df, + geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]), + crs=projection, + ) + +def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + + # Extract the griddification definitions + griddify_definitions = spatial_config["griddify"] + + # Get the projection definition + projection = spatial_config["projection"] + + # Compute the boundary box GeoDataFrame + boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + + # Convert the coordinates, if needed + if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): + # ---- Compute the equivalent UTM string + utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), + np.median(boundary_box.loc[0:3, "y"]))) + # ---- Compute the boundary box GeoDataFrame with the new projection + boundary_box = boundary_box.to_crs(utm_num) + # ---- Create a new projection for later + projection_new = f"epsg:{utm_num}" + else: + projection_new = projection + + # Define the step sizes + # ---- Define x step size + x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters + # ---- Define y step size + y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + + # Get the boundary tuple + xmin, ymin, xmax, ymax = boundary_box.total_bounds + + # Generate the cells + grid_cells = [] + # ---- Iterate through + for y0 in np.arange(ymin, ymax+y_step, y_step): + for x0 in np.arange(xmin, xmax+x_step, x_step): + x1 = x0-x_step + y1 = y0+y_step + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + + # Get the centroids + cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + + # Get the `prc_nasc_df` values, if they exist, and apply stratification information + if not acoustic_data["prc_nasc_df"].empty: + + # + prc_nasc_df = acoustic_data["prc_nasc_df"] + + # to GDF + prc_nasc_gdf = gpd.GeoDataFrame( + data=prc_nasc_df, + 
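# [Editor's sketch -- annotation, not a patch line] The boundary DataFrame that
# `define_boundary_box` above assembles for a longitude/latitude bounds entry, with illustrative
# coordinates; downstream only `total_bounds` and the x/y medians are consumed.
import geopandas as gpd
import pandas as pd

bound_df = pd.DataFrame({
    "x": [-135.0, -120.0, -120.0, -135.0, -135.0],
    "y": [34.0, 55.0, 55.0, 34.0, 34.0],
})
boundary_box = gpd.GeoDataFrame(
    data=bound_df,
    geometry=gpd.points_from_xy(bound_df["x"], bound_df["y"]),
    crs="epsg:4326",
)
print(boundary_box.total_bounds)  # [-135.   34. -120.   55.]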
geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), + crs=projection, + ) + # to UTM + prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + + prc_nasc_new["x"] = prc_nasc_new["geometry"].x + prc_nasc_new["y"] = prc_nasc_new["geometry"].y + + # ---- Bin the latitude data + prc_nasc_new["stratum_x"] = pd.cut( + prc_nasc_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + prc_nasc_new["stratum_y"] = pd.cut( + prc_nasc_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + acoustic_data["prc_nasc_df"]["stratum"] = ( + prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) + ) + + if not biology_data["trawl_info_df"].empty: + + # + trawl_info_df = biology_data["trawl_info_df"] + + # to GDF + trawl_info_gdf = gpd.GeoDataFrame( + data=trawl_info_df, + geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), + crs=projection, + ) + # to UTM + trawl_info_new = trawl_info_gdf.to_crs(projection_new) + + trawl_info_new["x"] = trawl_info_new["geometry"].x + trawl_info_new["y"] = trawl_info_new["geometry"].y + + # ---- Bin the latitude data + trawl_info_new["stratum_x"] = pd.cut( + trawl_info_new["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + trawl_info_new["stratum_y"] = pd.cut( + trawl_info_new["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # + biology_data["trawl_info_df"]["stratum"] = ( + trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) + ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index e8c60da5..579cf463 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,7 +1,6 @@ from typing import Union from pathlib import Path import copy -import yaml from .live_core import( LIVE_DATA_STRUCTURE, @@ -14,7 +13,7 @@ ) from . import live_data_processing as eldp - +from . 
import live_data_loading as eldl class LiveSurvey: """ A real-time processing version of the `echopop` base `Survey` class that ingests biological, @@ -25,7 +24,6 @@ def __init__( self, live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], - update_config: bool = True, verbose: bool = True, ): # Initialize `meta` attribute @@ -33,7 +31,7 @@ def __init__( # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = eldp.live_configuration(Path(live_init_config_path), + self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) # ---- Initialize config key for database files self.config.update( @@ -52,7 +50,7 @@ def __init__( # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict # Ingest data # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"] = eldp.load_acoustic_data(self.config) + self.input["acoustics"]["prc_nasc_df"] = eldl.load_acoustic_data(self.config) # ---- Biology self.input["biology"] = eldp.load_biology_data(self.config) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 4b282e13..0d6a6d58 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,7 +1,9 @@ from sqlalchemy import create_engine, text, Engine, inspect import sqlalchemy as sqla import pandas as pd -from typing import Optional +from typing import Optional, Literal, Union, List +import numpy as np +from pathlib import Path def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): @@ -117,23 +119,31 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) - # ---- Identify any possible DATETIME columns - # datetime_columns = ( - # {col["name"]: str for col in columns_info - # if isinstance(col["type"], sqla.sql.sqltypes.DATETIME)} - # ) - # ---- Encapsulate datetimes with quotes by converting to string - # dataframe = dataframe.astype(datetime_columns) # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] + + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) + # ---- Tuple to String - data_str = ", ".join( - # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" - f"({', '.join(map(lambda x: f'\'{x}\'' - if isinstance(x, str) or isinstance(x, pd.Timestamp) - else 'NULL' if x is None else str(x), row))})" - for row in data_tuple - ) + # data_str = ", ".join( + # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" + # f"({', '.join(map(lambda x: f'\'{x.replace('\\', '\\\\')}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" + # for row in data_tuple + # ) + flattened_data = [format_value(x) for row in data_tuple for x in row] + data_str = "({})".format(", ".join(flattened_data)) + # data_str = ", ".join( + # "({})".format(", ".join(map(format_value, row))) + # for row in data_tuple + # ) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -156,8 +166,7 @@ def sql_insert(connection: 
sqla.Connection, table_name: str, columns: list, data # Commit connection.commit() -from typing import Literal -import numpy as np + def sql_select(connection: sqla.Connection, table_name: str, columns: list, output_type: type = pd.DataFrame): @@ -258,6 +267,75 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation +def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = Path(root_directory) / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Create a list of string-formatted Path names + files_str = [str(file) for file in files] + # ---- Create DataFrame + current_files = pd.DataFrame(files_str, columns=["filepath"]) + + # Check for the table `files_read` + files_read_tbl = SQL(db_file, "validate", table_name="files_read") + + # Validate whether the table exists; if not, create the table and then insert + if not files_read_tbl: + # ---- Create table + SQL(db_file, "create", table_name="files_read", dataframe=current_files, + primary_keys = ["filepath"]) + # ---- Populate table + SQL(db_file, "insert", table_name="files_read", dataframe=current_files) + # ---- Break early + return files_str, db_file + + # Query already existing files + previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) + # ---- Insert file list + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + + # Filter out previously processed files + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file + +# TODO: Documentation +def sql_data_exchange(database_file: Path, **kwargs): + + # Check whether the `table_name` table exists + table_exists = SQL(database_file, "validate", **kwargs) + + # If empty and table does not exist + if kwargs["dataframe"].empty and table_exists: + return SQL(database_file, "select", **kwargs) + + # Create table if it does not exist and run the initial insertion + if not table_exists: + # ---- Create table + SQL(database_file, "create", **kwargs) + # ---- Ignore the `id_columns` argument, if present + try: + del kwargs["id_columns"] + except KeyError: + pass + # ---- Insert into table + SQL(database_file, "insert", **kwargs) + # ---- Return the initial dataframe + return kwargs.get("dataframe") + + # Insert into the table + SQL(database_file, "insert", **kwargs) + + # Select existing data frame the database and return the output + return SQL(database_file, "select", **kwargs) # TODO: Documentation @@ -280,31 +358,6 @@ def SQL(db_file: str, command: str, **kwargs): kwargs = {key: value for key, value in kwargs.items() if key in command_args} # ---- Return output return command_function(connection, **kwargs) - # # ---- SELECT - # if command == "select": - # return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # # ---- REPLACE - # elif command == "replace": - # # ---- Extract dataframe - # df_to_add = kwargs["dataframe"] - # # ---- Replace current - # df_to_add.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="replace", index=False) - - # # ---- INSERT - # elif command == "insert": - # # ---- Extract dataframe - # df_to_add = 
kwargs["dataframe"] - # # ---- Insert into the table - # df_to_add.to_sql(name=kwargs["table_name"], con=connection, if_exists="append", - # index=False) - # # ---- INSPECT - # elif command == "inspect": - # return inspect(engine).get_table_names() - # # ---- OTHER COMMAND - # else: - # connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index c01445b3..ba7c2a2c 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -26,8 +26,23 @@ file_configuration.update({"database": {"acoustics": None, "biology": None}}) #################################################################################################### # * Accessory function for tuning the acoustic transmit frequency units/scaling +def format_vlaue(x): + pass + +def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) - +data_str = ", ".join( + "({})".format(", ".join(format_value(x) for x in row)) + for row in data_tuple +) From 6d439bb18a8118fe12d4dd982627182ac9208b13 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 31 Jul 2024 18:31:20 -0700 Subject: [PATCH 09/81] General changes --- config_files/live_initialization_config.yml | 6 +- echopop/live/live_acoustics.py | 156 ++-- echopop/live/live_biology.py | 812 ++++++++++++++++++++ echopop/live/live_core.py | 12 +- echopop/live/live_data_loading.py | 498 ++++++------ echopop/live/live_spatial_methods.py | 112 ++- echopop/live/live_survey.py | 79 +- echopop/live/sql_methods.py | 289 ++++++- echopop/utils/operations.py | 9 +- echopop/zarr_read_ingest_test.py | 266 ++++++- 10 files changed, 1809 insertions(+), 430 deletions(-) create mode 100644 echopop/live/live_biology.py diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index a407520e..9436cefc 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -45,11 +45,7 @@ # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates - link_biology_acoustics: - global: False - INPFC: True - closest_haul: False - weighted_haul: False + link_biology_acoustics: INPFC ##################################################################################################################### # Acoustics settings# diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index f526f578..21ba1e23 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,9 +1,55 @@ from typing import Union, Optional - +import numpy as np import pandas as pd from echopop.acoustics import ts_length_regression, to_linear, to_dB +# TODO: Documentation +def configure_transmit_frequency(frequency_values: pd.Series, + transmit_settings: dict, + current_units: str): + + # Extract transmit frequency units defined in configuration file + configuration_units = transmit_settings["units"] + + # Transform the units, if necessary + # ---- Hz to kHz + if current_units == "Hz" and configuration_units == 
"kHz": + return frequency_values * 1e-3 + # ---- kHz to Hz + elif current_units == "kHz" and configuration_units == "Hz": + return frequency_values * 1e3 + # ---- No change + else: + return frequency_values + +# TODO: Documentation +def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + file_configuration: dict) -> pd.DataFrame: + + # Get acoustic processing settings + acoustic_analysis_settings = file_configuration["acoustics"] + # ---- Extract the fined acoustic frequency + transmit_settings = acoustic_analysis_settings["transmit"] + + # Filter the dataset + # ---- Configure `frequency_nominal`, if necessary + prc_nasc_df["frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"]) + ) + # ---- Filter out any unused frequency coordinates + prc_nasc_df_filtered = ( + prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + ) + + # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Replace NASC `NaN` values with `0.0` + prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + # TODO: Documentation def average_sigma_bs(length: Union[pd.DataFrame, float, int], weights: Optional[Union[float, int, str]] = None): @@ -46,6 +92,10 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Create copy acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + # Compute ABC + # ---- Convert NASC to ABC + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() @@ -62,65 +112,49 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): "center_of_mass": np.nan, "dispersion": np.nan, "evenness": np.nan, - "aggregation": np.nan, + "aggregation_index": np.nan, "occupied_area": 0.0, }) else: - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. 
) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass + # Create the `echometrics` dictionary echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() + # ---- Number of layers + "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), + # ---- Mean Sv (back-calculated) + "mean_Sv": float( + 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + ), + # ---- Max Sv (back-calculated) + "max_Sv": float( + 10 * np.log10(acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) + ), + # ---- (Logarithmic) acoustic abundance + "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), + # ---- Center-of-mass + "center_of_mass": float( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ), + # ---- Evenness + "evenness": float( + (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + ), + # ---- Occupied area + "occupied_area": float( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() ) }) - # Compute the dispersion + # Update variable-dependent metrics echometrics.update({ - "dispersion": ( + # ---- Dispersion + "dispersion": float( ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) + ), + # ---- Index of aggregation + "aggregation_index": float(1 / echometrics["evenness"]), }) # Return the dictionary @@ -141,3 +175,27 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # Convert `nasc_dict` to a DataFrame and return the output return pd.Series(nasc_dict) + +def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + ) + # ---- Amend the dtypes if echometrics were computed + if echometrics: + # ---- Set dtypes + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation_index": float, "occupied_area": float}) + ) + # ---- Reorder columns + nasc_data_df = nasc_data_df[[ + "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", + "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area" + ]] diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py new file mode 100644 index 00000000..cf04589b --- /dev/null +++ b/echopop/live/live_biology.py @@ -0,0 +1,812 @@ +import pandas as pd +import numpy as np +from .sql_methods import SQL, sql_data_exchange, get_table_key_names +from echopop.acoustics import 
ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce + +def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + + # Create dataframe copy + data_copy = biology_data.copy() + + # Iterate through dictionary to apply filters (if present) + for column, value in filter_dict.items(): + if column in data_copy.columns: + data_copy = data_copy[data_copy[column] == value] + + # Return output + return data_copy + +def merge_trawl_info(biology_dict: dict): + + # Get the trawl information dictionary + trawl_info_df = biology_dict["trawl_info_df"] + + # Update `catch_df` + biology_dict["catch_df"] = biology_dict["catch_df"].merge(trawl_info_df) + + # Update `length_df` + biology_dict["length_df"] = biology_dict["length_df"].merge(trawl_info_df) + + # Update `specimen_df` + biology_dict["specimen_df"] = biology_dict["specimen_df"].merge(trawl_info_df) + + # Drop the trawl information + del biology_dict["trawl_info_df"] + +def prepare_length_distribution(file_configuration: dict): + + # Get the length distribution parameters + distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + + # Create histogram bins + length_bins = ( + np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, + dtype=float) + ) + + # Get the binwidths + binwidth = np.diff(length_bins / 2.0).mean() + + # Generate the equivalent interval boundaries for each bin + intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) + + # Format as a DataFrame and return the output + # ---- Add Categorical interval column + length_bins_df = ( + pd.DataFrame({"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)}) + ) + # ---- Add numeric lower boundary + length_bins_df["lower"] = length_bins_df["interval"].apply(lambda x: x.left).astype(float) + # ---- Add numeric upper boundary + length_bins_df["upper"] = length_bins_df["interval"].apply(lambda x: x.right).astype(float) + + # Return the dataframe that will be incorporated into the biological data attribute + return length_bins_df + +def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_configuration: dict): + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Get contrasts used for filtering the dataset + # ---- Species + species_filter = file_configuration["species"]["number_code"] + # ---- Trawl partition information + trawl_filter = file_configuration["biology"]["catch"]["partition"] + # ---- Create filter dictionary + filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) + + # Apply the filter + filtered_biology_output = { + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + } + # ---- Create new data flag + file_configuration["length_distribution"] = prepare_length_distribution(file_configuration) + # ---- Incorporate additional data, if new data are present + if filtered_biology_output: + # ---- Merge the trawl information and app + merge_trawl_info(filtered_biology_output) + # ---- Apply spatial definitions/stratification, if any + apply_spatial_definitions(filtered_biology_output, spatial_dict) + # ---- Swap this out if no new files are present + if not filtered_biology_output: + # ---- Get available tables + table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) + # ---- Plug into the dictionary + 
filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) + # ---- Initialize the results dictionary + sql_results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} + + # Update the SQL database + for table_name, df in filtered_biology_output.items(): + # ---- Get identifier columns + key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) + # ---- Create copy + df = df.copy() + # ---- Assign values for key values + key_values = [str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) + for index in df.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = key_values + # ---- Insert the new data into the database & pull in the combined dataset + table_df = sql_data_exchange(biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Add to the outgoing dictionary (and drop SQL db identifier) + sql_results_dict.update({table_name: table_df.drop(columns="id")}) + + # Return the output + return filtered_biology_output, sql_results_dict + +def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, + file_configuration: dict): + + # Assign contrast columns + contrast_list = [] + # ---- Check for "stratum" column + if "stratum" in specimen_data.columns and "stratum" in length_data.columns: + contrast_list.append(["stratum"]) + # ---- Add the additional columns + contrast_list.append(["haul_num", "species_id", "length"]) + # ---- Concatenate + contrast_columns = list(np.concatenate(contrast_list)) + + # Meld the biological datasets + length_datasets = specimen_data.meld(length_data, + contrasts=contrast_columns) + + # Get the TS-length model parameterization + ts_length_parameters_spp = [ + spp + for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() + if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) + ] + + # Extract the target species information + target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + # ---- Filter out non-target species + length_datasets = ( + length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + ) + # ---- Merge with `length_datasets` + ts_length_df = length_datasets.merge(target_species, + left_on=["species_id"], right_on=["number_code"]) + + # Compute the mean sigma_bs for this particular haul + # ---- Create primary key list + key_list = list(set(contrast_columns) - set(["length"])) + # ---- Compute haul-specific means + sigma_bs_df = ( + ts_length_df + .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + .apply(lambda x: average_sigma_bs(x, weighted="length_count"), include_groups=False) + .reset_index(name="sigma_bs") + ) + + # For SQL database storage purposes, the sum and count are stored instead + # ---- Count sum + sigma_bs_df["sigma_bs_count"] = ts_length_df["length_count"].sum() + # ---- Value sum + sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] + + # Get the database file name + acoustic_db = file_configuration["database"]["acoustics"] + + # Check for `sigma_bs_mean_df` in the database file + # ---- Query database + if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + # ---- Create + SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + primary_keys=list(set(contrast_columns) - set(["length"]))) + # ---- Populate table + 
SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + else: + # ---- Create a filter condition command + condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) + # ---- Update the table key + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # ---- Update the actual `sigma_bs` value in the table + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + +def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["trawl_partition", "sex", "haul_num", "species_id", "length_bin"]) + + # Gather specimen measurements to represent 'all' fish + specimen_data_all = specimen_data.assign(sex="all") + + # Combine sexed and 'all' specimens + # ---- Vertical concatenation + specimen_data_all = pd.concat( + [specimen_data[specimen_data["sex"].isin(["male", "female"])], specimen_data_all], + ignore_index=True + ) + # ---- Remove bad values + specimen_data_all.dropna(subset=["length", "weight"], inplace=True) + + # Get SQL database file + biology_db = file_configuration["database"]["biology"] + + # Check for `specimen_data_df` in the database file + # ---- Query database + # if not SQL(biology_db, "validate", table_name="specimen_data_df"): + # ---- Assign values for key values + key_values = [str(index) + "-" + + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) + for index in specimen_data_all.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + specimen_data_all.loc[:, "id"] = key_values + # ---- Insert the new data into the database & pull in the combined dataset + specimen_data_sql = sql_data_exchange(biology_db, + dataframe=specimen_data_all, + table_name="specimen_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame) + # ---- Drop SQL db identifier + specimen_data_sql.drop(columns="id", inplace=True) + + # Fit length-weight linear regression by male, female, and all fish + length_weight_regression_df = ( + specimen_data_sql.groupby(["species_id", "sex"]) + .apply( + lambda df: pd.Series( + np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), + index=["rate", "initial"], + ), + include_groups=False, + ) + .reset_index() + ) + + # Predict weights for binned lengths + # ---- Initialize dataframe + weight_fitted_df = distribution_df.copy() + # ---- Expand/merge with length-weight regression coefficients + weight_fitted_df = weight_fitted_df.merge(length_weight_regression_df, how="cross") + # ---- Predict weight per bin + weight_fitted_df["weight_modeled"] = ( + 10.0 ** weight_fitted_df["initial"] + * weight_fitted_df["length_bin"] ** weight_fitted_df["rate"] + ) + # ---- Drop unused columns + weight_fitted_df = weight_fitted_df.filter( + ["length_bin", "species_id", "sex", "weight_modeled"] + ) + + # Adjust for cases where there are too few (< 5) specimens within a given length bin + # ---- Count number of specimens across length bins + weight_fitted_distribution_df = specimen_data_all.count_variable( + contrasts=["species_id", "sex", "length_bin"], variable="length", 
fun="size" + ).set_index(["species_id", "sex", "length_bin"]) + # ---- Get mean weight per bin as well + weight_fitted_distribution_df["weight_mean"] = ( + specimen_data_all.groupby(["species_id", "sex", "length_bin"], observed=False)["weight"] + .mean() + .fillna(0.0) + ) + # ---- Merge with the fitted weights + weight_fitted_distribution_df = weight_fitted_distribution_df.merge( + weight_fitted_df, + on=["species_id", "sex", "length_bin"], + how="outer" + ) + # ---- Fill missing counts + weight_fitted_distribution_df["weight_mean"] = ( + weight_fitted_distribution_df["weight_mean"].fillna(0.0) + ) + # ---- Fill missing weights + weight_fitted_distribution_df["count"] = ( + weight_fitted_distribution_df["count"].fillna(0).astype(int) + ) + # ---- Find fitted weights accounting for low sample sizes + weight_fitted_distribution_df["weight_fitted"] = np.where( + weight_fitted_distribution_df["count"] < 5, + weight_fitted_distribution_df["weight_modeled"], + weight_fitted_distribution_df["weight_mean"], + ) + # ---- Pull out unused columns + weight_fitted_distribution_df = weight_fitted_distribution_df.filter( + ["species_id", "sex", "length_bin", "weight_fitted"] + ) + + # Check for `weight_fitted_df` in the database file + # ---- Create id/primary key + key_values = ["-".join(weight_fitted_distribution_df + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str)) + for idx in weight_fitted_distribution_df.index] + # ---- Add to the output + output_df = weight_fitted_distribution_df.assign(id=key_values) + # ---- Query database + if not SQL(biology_db, "validate", table_name="weight_fitted_df"): + # ---- Create + SQL(biology_db, "create", table_name="weight_fitted_df", + dataframe=output_df, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="weight_fitted_df", + dataframe=output_df, id_columns=["id"]) + else: + # ---- Update the table + sql_group_update(db_file=biology_db, + dataframe=output_df, + table_name="weight_fitted_df", + columns=["weight_fitted"], + unique_columns=["species_id", "sex", "length_bin"], + id_columns=["id"]) + + # Return the dataframe + return weight_fitted_distribution_df + +def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, + length_weight_df: pd.DataFrame, file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Get the spatial key + spatial_key = contrast_columns.copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + # Get database + biology_db = file_configuration["database"]["biology"] + + # Pull the relevant data + # SQL(biology_db, "select", table_name="length_df", + # columns=list(set(length_data.columns) - set(["length_bin"]))) + # list(set(length_data.columns) - set(["length_bin"])) + # Get length distribution + # distribution_df = file_configuration["length_distribution"] + + # Generate sex-specific interpolators for fitted length-weight values for binned length counts + # ---- Parse the male- and female-specific fitted weight values + length_weight_sex = length_weight_df.copy()[length_weight_df["sex"].isin(["male", "female"])] + # ---- Create interpolator functions + interpolators = group_interpolator_creator( + grouped_data=length_weight_sex, + independent_var="length_bin", + dependent_var="weight_fitted", + contrast=["sex", "species_id"], + ) + # ---- Create helper/lambda function + def weight_interpolator(dataframe_row): + sex = 
dataframe_row["sex"] + species_id = dataframe_row["species_id"] + length = dataframe_row["length"] + if (sex, species_id) in interpolators: + return interpolators[(sex, species_id)](length) + else: + return None + + # Extract only sexed fish from the unaged (station 1) length dataset + length_data_sexed = length_data[length_data["sex"].isin(["male", "female"])].copy() + # ---- Add interpolated weights to the general length dataset + length_data_sexed.loc[:, "weight_interp"] = ( + length_data_sexed.apply(weight_interpolator, axis=1) * length_data_sexed["length_count"] + ) + # ---- Convert interpolated weights (summed across length counts) into a table + length_table_sexed = ( + length_data_sexed + .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight_interp"].sum() + ).reset_index() + + # Remove specimen data with missing data required for this analysis + # ---- Drop unsexed fish + specimen_data_filtered = specimen_data[specimen_data["sex"].isin(["male", "female"])].copy() + # ---- Remove NaN + specimen_data_filtered = specimen_data_filtered.dropna(subset=["length", "weight"]) + # ---- Convert to a table + specimen_table_sexed = ( + specimen_data_filtered + .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight"].sum() + ).reset_index() + + # Check for `length_weight_df` in the database file + # ---- Create id/primary key + key_values = ["-".join(length_table_sexed.reset_index() + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str)) + for idx in length_table_sexed.reset_index().index] + # ---- Add to the output + length_table_sexed["id"] = key_values + # ---- Query database + if not SQL(biology_db, "validate", table_name="length_weight_df"): + # ---- Create + SQL(biology_db, "create", table_name="length_weight_df", + dataframe=length_table_sexed, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="length_weight_df", + dataframe=length_table_sexed, id_columns=["id"]) + else: + # ---- Update the table + sql_group_update(db_file=biology_db, + dataframe=length_table_sexed, + table_name="length_weight_df", + columns=["weight_interp"], + unique_columns=contrast_columns, + id_columns=["id"]) + # length_sql_sexed + + + # , specimen_sql_sexed + + # Return outputs + return length_table_sexed, specimen_table_sexed + +def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: pd.DataFrame, + length_binned: pd.DataFrame, file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + + # Get unique values of each contrast column across the biological datasets + dfs = [pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip(contrast_columns, [specimen_binned, + specimen_binned_filtered, + length_binned])] + # ---- Reduce into a single DataFrame + count_total = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # ---- Set the indices + count_total.set_index(contrast_columns, inplace=True) + # ---- Specimen count + count_total["total_specimen"] = specimen_binned.groupby(contrast_columns)["count"].sum() + # ---- Specimen filtered count + count_total["total_specimen_filtered"] = ( + specimen_binned_filtered.groupby(contrast_columns)["count"].sum() + ) + # ---- Length count + count_total["total_length"] = length_binned.groupby(contrast_columns)["count"].sum() + # ---- Fill NaN + 
count_total.fillna(0, inplace=True) + count_total = ( + count_total.reset_index().set_index(list(set(contrast_columns) - set(["sex", "species_id"]))) + ) + # ---- Grand totals + count_total["total_overall"] = ( + count_total.loc[count_total.sex == "all", "total_specimen_filtered"] + + count_total.loc[count_total.sex == "all", "total_length"] + ) + # ---- Reset index + count_total = count_total.reset_index() + + # Compute the number proportions for the specimen data + specimen_number_proportion = specimen_binned_filtered[ + specimen_binned_filtered["sex"].isin(["male", "female", "all"]) + ].merge( + count_total[list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"])))], + on=contrast_columns + ) + # ---- Within-dataset proportion + specimen_number_proportion["proportion_number_specimen"] = ( + specimen_number_proportion["count"] / specimen_number_proportion["total_specimen_filtered"] + ) + # ---- Overall survey proportion + specimen_number_proportion["proportion_number_specimen_overall"] = ( + specimen_number_proportion["count"] / specimen_number_proportion["total_overall"] + ) + # ---- Compute the sex proportions + sex_number_proportions = ( + specimen_number_proportion.groupby(contrast_columns, observed=False)[ + "proportion_number_specimen_overall" + ] + .sum() + .reset_index() + ) + + # Compute the number proportions for the length data + length_number_proportion = length_binned[ + length_binned["sex"].isin(["male", "female", "all"]) + ].merge( + count_total[list(set(contrast_columns).union(set(["total_length", "total_overall"])))], + on=contrast_columns + ) + # ---- Within-dataset proportion + length_number_proportion["proportion_number_length"] = ( + length_number_proportion["count"] / length_number_proportion["total_length"] + ) + # ---- Overall survey proportion + length_number_proportion["proportion_number_length_overall"] = ( + length_number_proportion["count"] / length_number_proportion["total_overall"] + ) + + # Gather unaged (sexed) number proportions + # ---- Merge + sex_number_proportions = sex_number_proportions.merge( + length_number_proportion.groupby(contrast_columns)[ + "proportion_number_length_overall" + ] + .sum() + .reset_index(), + how="outer", + ).fillna(0.0) + # ---- Sum overall total across datasets + sex_number_proportions["proportion_number_overall"] = ( + sex_number_proportions.proportion_number_specimen_overall + + sex_number_proportions.proportion_number_length_overall + ) + + # Return the output + return specimen_number_proportion, length_number_proportion, sex_number_proportions + +def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id", "length_bin"]) + + # Bin counts by sex + # ---- Specimen + specimen_number_distribution = pd.concat( + [specimen_data, specimen_data.assign(sex="all")] + ).count_variable( + contrasts=contrast_columns, + variable="length", + fun="size", + ) + # ---- Filter out unsexed data for parallel number counts and drop any NA's + specimen_number_distribution_filtered = ( + pd.concat( + [ + specimen_data[specimen_data.sex != "unsexed"], + specimen_data[specimen_data.sex != "unsexed"].assign(sex="all"), + ] + ) + .dropna(subset=["length", "weight"]) + .count_variable( + contrasts=contrast_columns, + variable="length", + fun="size", + ) + ) + + # 
Repeat for the aggregated data + # ---- Length + length_number_distribution = pd.concat( + [length_data, length_data.assign(sex="all")] + ).count_variable( + contrasts=contrast_columns, + variable="length_count", + fun="sum", + ) + + return ( + specimen_number_distribution, + specimen_number_distribution_filtered, + length_number_distribution + ) + + +# def length_bin_counts(biology_dict: dict, file_configuration: dict): + +# # Get the spatial column name, if there is one +# contrast_columns = file_configuration["spatial_column"].copy() +# # ---- Append additional columns that will be used +# contrast_columns.extend(["sex", "species_id", "length_bin"]) + +# # Get database file +# biology_db = file_configuration["database"]["biology"] + +# # Get distribution data +# distribution_df = file_configuration["length_distribution"] + +# # Generate number counts for the length distribution +# length_datasets = ( +# biology_dict["specimen_df"] +# .meld(biology_dict["length_df"], +# contrasts=list(set(contrast_columns).union(["length_bin"]))) +# ) +# # ---- Create 'all' +# length_datasets_all = pd.concat([ +# length_datasets[length_datasets["sex"].isin(["male", "female"])], +# length_datasets.assign(sex="all") +# ]) + +# # Collapse by each bin +# grouped_length = ( +# length_datasets_all +# .groupby(contrast_columns, observed=False)["length_count"].sum() +# ) + +# # Get distinct DataFrame columns +# distinct_keys = ( +# grouped_length +# .reset_index() +# .loc[:, list(set(contrast_columns) - set(["length_bin"]))].drop_duplicates() +# ) + +# # Create complete DataFrame +# complete_distrib_df = ( +# distribution_df.merge(distinct_keys, how="cross").set_index(contrast_columns) +# ) +# # ---- Pre-allocate the "length_count" column +# complete_distrib_df.loc[:, "count"] = 0 +# # ---- Add the computed counts +# complete_distrib_df.loc[grouped_length.index, "count"] = grouped_length +# # ---- Create output DataFrame +# output_df = complete_distrib_df.filter(["count"]).reset_index() + +# # Check for `length_count_df` in the database file +# # ---- Create id/primary key +# key_values = ["-".join(output_df +# .loc[idx, ["species_id", "sex", "length_bin"]] +# .values.astype(str)) +# for idx in output_df.index] +# # ---- Add to the output +# output_df["id"] = key_values +# # ---- Query database +# if not SQL(biology_db, "validate", table_name="length_count_df"): +# # ---- Create +# SQL(biology_db, "create", table_name="length_count_df", +# dataframe=output_df, primary_keys=["id"]) +# # ---- Populate table +# SQL(biology_db, "insert", table_name="length_count_df", +# dataframe=output_df, id_columns=["id"]) +# else: +# # ---- Update the table +# sql_group_update(db_file=biology_db, +# dataframe=output_df, +# table_name="length_count_df", +# columns=["count"], +# unique_columns=contrast_columns, +# id_columns=["id"]) + +# # Return output +# return output_df + + +def bin_length_data(biology_dict: dict, distribution_df: pd.DataFrame): + + # Create Lambda help function + def _quantize_lengths(dataset, distribution): + # ---- Cut/merge the underlying histogram/discretized length bins + if "length" in dataset.columns: + # ---- Cut the intervals + dataset["length_bin"] = pd.cut(dataset["length"], + np.unique(np.hstack([distribution["lower"], + distribution["upper"]])), + labels=distribution["length_bin"]).astype(float) + # ---- Return the dataset + return dataset + + # Update the data dictionary + biology_dict.update({ + k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items() + }) + + +def 
compute_average_weights(specimen_number_proportion: pd.DataFrame, + length_number_proportion: pd.DataFrame, + sex_number_proportions: pd.DataFrame, + length_weight_df: pd.DataFrame, + distribution_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + contrast_columns = file_configuration["spatial_column"].copy() + # ---- Append additional columns that will be used + contrast_columns.extend(["sex", "species_id"]) + + overall_proportions = sex_number_proportions[sex_number_proportions["sex"] == "all"] + updated_proportions = sex_number_proportions.copy() + + updated_proportions["number_proportion_length_all"] = overall_proportions["proportion_number_length_overall"].values[0] + updated_proportions["number_proportion_specimen_all"] = overall_proportions["proportion_number_specimen_overall"].values[0] + + # Calculate the mixed aged and unaged number proportions + updated_proportions["proportion_length"] = ( + updated_proportions["number_proportion_length_all"] / + (updated_proportions["number_proportion_length_all"] + + updated_proportions["proportion_number_specimen_overall"]) + ) + # ---- Calculate aged number proportions per sex per stratum + updated_proportions["proportion_specimen"] = ( + updated_proportions["proportion_number_specimen_overall"] / ( + updated_proportions["proportion_number_specimen_overall"] + + updated_proportions["proportion_length"] + ) + ) + # ---- Reduce the columns + proportion_df = ( + updated_proportions.filter(contrast_columns + ["proportion_length", "proportion_specimen"]) + ) + + # Combine the aged-unaged (or station-specific) proportions for calculations + # ---- Wide-to-long DataFrame + station_proportions = pd.wide_to_long( + proportion_df, + stubnames="proportion", + i=contrast_columns, + j="group", + sep="_", + suffix="\\w+", + ).reset_index() + # ---- Convert to Table (to replicate indexed matrix operations) + station_proportions_table = station_proportions.pivot_table( + index=["species_id", "group", "sex"], + columns=file_configuration["spatial_column"].copy(), values="proportion" + ).fillna(0.0) + + # Calculate the number length proportions that will later be converted into weight + # ---- Specimen + specimen_length_distribution = ( + specimen_number_proportion.groupby(contrast_columns + ["length_bin"], observed=False)[ + "proportion_number_specimen" + ] + .sum() + .reset_index(name="number_proportion") + ) + # ---- Length + length_length_distribution = ( + length_number_proportion[length_number_proportion.sex != "unsexed"][ + contrast_columns + ["length_bin", "proportion_number_length"] + ].rename(columns={"proportion_number_length": "number_proportion"}) + ) + + # Get unique values of each contrast column across the biological datasets + dfs = [pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip(contrast_columns, [specimen_number_proportion, + length_number_proportion, + sex_number_proportions])] + # ---- Reduce into a single DataFrame + full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + + # + length_distribution_df = distribution_df.copy() + complete_distrib_df = ( + length_distribution_df.merge(full_contrast_keys, how="cross") + .drop(columns=["interval", "lower", "upper"]) + .set_index(contrast_columns + ["length_bin"]) + ) + + specimen_length_complete = complete_distrib_df.copy() + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]) + specimen_length_complete.loc[:, 
"number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) + + length_length_complete = complete_distrib_df.copy() + length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]) + length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) + + # ---- Concatenate the two datasets + combined_number_proportions = ( + pd.concat([specimen_length_complete.assign(group="specimen"), + length_length_complete.assign(group="length")]) + ).reset_index() + # ---- Convert to Table (to replicate indexed matrix operations) + length_proportions_table = combined_number_proportions.pivot_table( + index=["species_id", "group", "sex", "length_bin"], + columns=file_configuration["spatial_column"].copy(), + values="number_proportion", + observed=False, + ).fillna(0.0) + + # Convert the fitteed weights into a Table (to replicate index matrix operations) + fitted_weight_table = length_weight_df.pivot_table( + index=["species_id", "sex", "length_bin"], values="weight_fitted", observed=False + ) + + # Calculate the average weights for male, female, and all fish within each stratum + # ---- All + fitted_weight_table.loc[:, "all", :] + weight_all = fitted_weight_table.loc[:, "all", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "all"] + * station_proportions_table.loc[:, "specimen", "all"] + + length_proportions_table.loc[:, "length", "all"] + * station_proportions_table.loc[:, "length", "all"] + ) + weight_male = fitted_weight_table.loc[:, "male", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "male"] + * station_proportions_table.loc[:, "specimen", "male"] + + length_proportions_table.loc[:, "length", "male"] + * station_proportions_table.loc[:, "length", "male"] + ) + weight_female = fitted_weight_table.loc[:, "female", :]["weight_fitted"].values.dot( + length_proportions_table.loc[:, "specimen", "female"] + * station_proportions_table.loc[:, "specimen", "female"] + + length_proportions_table.loc[:, "length", "female"] + * station_proportions_table.loc[:, "length", "female"] + ) + # ---- Combine the averaged weights for each sex and all fish + fitted_weight_df = full_contrast_keys.copy() + fitted_weight_df["average_weight"] = ( + np.concatenate([weight_all, weight_male, weight_female]) + ) + + # Return output + return fitted_weight_df \ No newline at end of file diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 28a63237..677cddc3 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -79,11 +79,15 @@ }, "length": { "dtypes": { + "operation_number": int, + "partition": str, "sex": str, "rounded_length": int, "frequency": int, }, "names": { + "operation_number": "haul_num", + "partition": "trawl_partition", "sex": "sex", "rounded_length": "length", "frequency": "length_count", @@ -91,13 +95,17 @@ }, "specimen": { "dtypes": { - "rounded_length": int, + "operation_number": int, + "partition": str, + "length": float, "organism_weight": float, "sex": str, }, "names": { + "operation_number": "haul_num", + "partition": "trawl_partition", "sex": "sex", - "rounded_length": "length", + "length": "length", "organism_weight": "weight" }, }, diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index ce5a06f7..1220591f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -13,6 +13,8 @@ SPATIAL_CONFIG_MAP ) +from .live_spatial_methods 
import create_inpfc_strata + # TODO: Incorporate complete YAML file validator # TODO: Documentation def live_configuration(live_init_config_path: Union[str, Path], @@ -55,53 +57,105 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -# TODO: Documentation -def validate_data_directory(root_directory: str, file_settings: dict) -> List[Path]: +def read_acoustic_files(acoustic_files: List[Path]) -> tuple: - # Get acoustic directory and initialization settings - # ---- Create the full filepath - directory_path = Path(root_directory) / file_settings["directory"] - # ---- Get the defined file extension - file_extension = file_settings["extension"] + # Get the file-specific settings, datatypes, columns, etc. + # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` + acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_extension}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_extension}` files found in [{directory_path}]!" - ) + # Read all of the zarr files + results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) + for i, (data_df, unit_dict) in enumerate( + read_acoustic_zarr(Path(file), acoustics_config_map) + for file in acoustic_files + )] + + # Concatenate the dataframe component + acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index = True) + # ---- Add the `acoustic_data_units` to the dictionary and output the resulting tuple + return acoustic_data_df, results_list[0][1] if results_list else None + +def filter_filenames(directory_path: Path, filename_id: str, + files: List[Path], + file_extension: str): + + # Drop the `{FIELD_ID}` tag identifier + file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + # ---- Replace all other tags with `*` placeholders + file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) + # ---- Create Path object with the generalized format + subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + # ---- List all files that match this pattern + subfile_str = [str(file) for file in list(subfile_path_obj)] + + # Convert list of proposed files from Path to String + file_str = [str(file) for file in list(files)] - # Return the output - return data_files + # Find intersection with the proposed filenames and return the output + return list(set(subfile_str).intersection(set(file_str))) + +def read_biology_files(biology_files: List[Path], file_configuration: dict): -def read_acoustic_zarr(acoustic_files: Path) -> tuple: + # Get the biology data file settings + file_settings = file_configuration["input_directories"]["biology"] # Get the file-specific settings, datatypes, columns, etc. 
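+    # NOTE: `LIVE_INPUT_FILE_CONFIG_MAP["biology"]` (defined in `live_core.py`) holds one
+    # entry per biological dataset (e.g. "length", "specimen"), each pairing a `dtypes`
+    # map with a `names` map used to coerce and rename the raw columns (e.g.
+    # `operation_number` -> `haul_num`, `partition` -> `trawl_partition`,
+    # `frequency` -> `length_count`) before the per-dataset CSVs are concatenated below.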
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] - # ---- Create list of coordinate data variables - specified_vars = list(acoustics_config_map["xarray_variables"].keys()) - # ---- Create set of coordinate variables - specified_coords = list(acoustics_config_map["xarray_coordinates"].keys()) - # ---- Concatenate into a full configuration map - full_config_map = {**acoustics_config_map["xarray_coordinates"], - **acoustics_config_map["xarray_variables"]} + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + # ---- Extract all of the file ids + biology_config_ids = list(biology_file_ids.keys()) + # ---- Initialize the dictionary that will define this key in the `input` attribute + biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + # # ---- Create filepath object + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + + # Add SQL file to dict + file_configuration["database"]["biology"] = ( + Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + ) + + # Iterate through the different biology datasets and read them in + for dataset in list(biology_file_ids.keys()): + # ---- Get dataset-specific file lists + dataset_files = filter_filenames(directory_path, + biology_file_ids[dataset], + biology_files, + file_settings["extension"]) + # ---- If there are dataset files available + if dataset_files: + # ---- Read in validated biology data + dataframe_list = [read_biology_csv(Path(file), + file_settings["file_name_formats"][dataset], + biology_config_map[dataset]) + for file in dataset_files] + # ---- Concatenate the dataset + dataframe_combined = pd.concat(dataframe_list, ignore_index=True) + # ---- Lower-case sex + if "sex" in dataframe_combined.columns: + dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() + # ---- Lower-case trawl partition type + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + # ---- Reformat datetime column + if "datetime" in dataframe_combined.columns: + dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) + # ---- Add to the data dictionary + biology_output[f"{dataset}_df"] = dataframe_combined + + # Return the output + return biology_output + +def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: + # Format the file reading configuration + # ---- Concatenate into a full configuration map + full_config_map = {**config_map["xarray_coordinates"], + **config_map["xarray_variables"]} + # Determine the file loading method for the `acoustic_files` - if len(acoustic_files) > 1: - zarr_data_ds = xr.open_mfdataset(acoustic_files, engine="zarr", chunks="auto", - data_vars=specified_vars, coords=specified_coords) - else: - zarr_data_ds = xr.open_dataset(acoustic_files[0], engine="zarr", chunks="auto") + zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto") # Pre-process the Dataset, convert it to a DataFrame, and validate the structure # ---- Convert to a DataFrame @@ -119,6 +173,9 @@ def read_acoustic_zarr(acoustic_files: Path) -> tuple: # ---- Select defined columns zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) + # Add the filename as a column + zarr_data_df_filtered["source"] = 
Path(file).name + # Gather some of the units data_units = { "longitude": zarr_data_ds.longitude.units, @@ -130,99 +187,55 @@ def read_acoustic_zarr(acoustic_files: Path) -> tuple: return zarr_data_df_filtered, data_units # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - - # Extract transmit frequency units defined in configuration file - configuration_units = transmit_settings["units"] - - # Transform the units, if necessary - # ---- Hz to kHz - if current_units == "Hz" and configuration_units == "kHz": - return frequency_values * 1e-3 - # ---- kHz to Hz - elif current_units == "kHz" and configuration_units == "Hz": - return frequency_values * 1e3 - # ---- No change - else: - return frequency_values - -# TODO: Documentation -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, - file_configuration: dict) -> pd.DataFrame: - - # Get acoustic processing settings - acoustic_analysis_settings = file_configuration["acoustics"] - # ---- Extract the fined acoustic frequency - transmit_settings = acoustic_analysis_settings["transmit"] - - # Filter the dataset - # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) - ) - # ---- Filter out any unused frequency coordinates - prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] - ) - - # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object - # ---- Replace NASC `NaN` values with `0.0` - prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) +def validate_data_directory(file_configuration: dict, dataset: str, + input_filenames: Optional[list] = None) -> List[Path]: -# TODO: Documentation -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: + # Get the dataset file settings + file_settings = file_configuration["input_directories"][dataset] # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files, file_configuration["database"]["acoustics"] = ( - query_processed_files(root_directory, file_settings, acoustic_files) - ) - - # Read in the acoustic data files - if new_acoustic_files: - # ! 
[REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - # ---- Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - # ---- Return output - return prc_nasc_df_processed - else: - return None - -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): + if "data_root_dir" in file_configuration.keys(): + root_directory = Path(file_configuration["data_root_dir"]) + else: + root_directory = Path() + # ---- File folder + data_directory = Path(file_settings["directory"]) + # ---- Createa directory path + directory_path = root_directory / data_directory - # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subfile_str = [str(file) for file in list(subfile_path_obj)] + # Validate filepath, columns, datatypes + # ---- Error evaluation (if applicable) + if not directory_path.exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) - # Convert list of proposed files from Path to String - file_str = [str(file) for file in list(files)] + # Validate that files even exist + # ---- List available *.zarr files + data_files = list(directory_path.glob(f"*{'.'+file_settings["extension"]}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings["extension"]}` files found in [{directory_path}]!" + ) - # Find intersection with the proposed filenames and return the output - return list(set(subfile_str).intersection(set(file_str))) + # Check and format specific input filenames + if isinstance(input_filenames, list): + data_files = [directory_path / filename for filename in input_filenames] + # ---- Raise Error + elif input_filenames is not None: + raise TypeError( + "Data loading argument `input_filenames` must be a list." 
+ ) + + # Query the SQL database to process only new files (or create the db file in the first place) + valid_files, file_configuration["database"][dataset] = ( + query_processed_files(root_directory, file_settings, data_files) + ) + + # Return the valid filenames/paths + return valid_files def compile_filename_format(file_name_format: str): @@ -276,89 +289,6 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): # Return the resulting DataFrame return df_validated -def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: - - # Get the data input column names - if data_dict[table_name].empty: - # ---- Inspect the table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of the data columns - table_columns = list(inspected_table.keys()) - else: - # ---- Get the DataFrame column names - table_columns = data_dict[table_name].columns - - # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - "latitude"]) - ) - - # Return a list of the output - return list(key_columns) - -def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): - - # Create dataframe copy - data_copy = biology_data.copy() - - # Iterate through dictionary to apply filters (if present) - for column, value in filter_dict.items(): - if column in data_copy.columns: - data_copy = data_copy[data_copy[column] == value] - - # Return output - return data_copy - -def preprocess_biology_data(biology_output: dict, file_configuration: dict): - - # Get SQL database file - biology_db = file_configuration["database"]["biology"] - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = file_configuration["biology"]["catch"]["partition"] - # ---- Create filter dictionary - filter_dict = dict(species_id=species_filter, trawl_partition=trawl_filter) - - # Apply the filter - filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - # ---- Swap this out if no new files are present - if not filtered_biology_output: - # ---- Get available tables - table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) - # ---- Plug into the dictionary - filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary - results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Get identifier columns - key_columns = get_table_key_names(biology_db, filtered_biology_output, table_name) - # ---- Create copy - df = df.copy() - # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint - df.loc[:, "id"] = "row" + df.index.astype(str) + "-" + "-".join(key_columns) - # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - results_dict.update({table_name: table_df.drop(columns="id")}) - - # Return the output - return results_dict - def infer_datetime_format(timestamp_str: Union[int, 
str]): patterns = { r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS @@ -392,70 +322,70 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) -def load_biology_data(file_configuration: dict): - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["biology"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - biology_files = validate_data_directory(root_directory, file_settings) - - # Query `biology.db` to process only new files (or create the db file in the first place) - # SQL(biology_db, "drop", table_name="files_read") - new_biology_files, file_configuration["database"]["biology"] = ( - query_processed_files(root_directory, file_settings, biology_files) - ) - - # Get the file-specific settings, datatypes, columns, etc. - # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] - # ---- Extract the expected file name ID's - biology_file_ids = file_settings["file_name_formats"] - # ---- Extract all of the file ids - biology_config_ids = list(biology_file_ids.keys()) - # ---- Initialize the dictionary that will define this key in the `input` attribute - biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] +# def load_biology_data(file_configuration: dict): + +# # Get the acoustic file settings and root directory +# # ---- File settings +# file_settings = file_configuration["input_directories"]["biology"] +# # ---- Root directory +# root_directory = file_configuration["data_root_dir"] + +# # Get and validate the acoustic data directory and files +# biology_files = validate_data_directory(root_directory, file_settings) + +# # Query `biology.db` to process only new files (or create the db file in the first place) +# # SQL(biology_db, "drop", table_name="files_read") +# new_biology_files, file_configuration["database"]["biology"] = ( +# query_processed_files(root_directory, file_settings, biology_files) +# ) + +# # Get the file-specific settings, datatypes, columns, etc. 
+# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# # ---- Create filepath object +# directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - # Add SQL file to dict - file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] - ) - - # Iterate through the different biology datasets and read them in - for dataset in list(biology_file_ids.keys()): - # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - file_settings["file_name_formats"][dataset], - new_biology_files, - file_settings["extension"]) - # ---- If there are dataset files available - if dataset_files: - # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), - file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) - for file in dataset_files] - # ---- Concatenate the dataset - dataframe_combined = pd.concat(dataframe_list, ignore_index=True) - # ---- Lower-case sex - if "sex" in dataframe_combined.columns: - dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() - # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() - # ---- Reformat datetime column - if "datetime" in dataframe_combined.columns: - dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) - # ---- Add to the data dictionary - biology_output[f"{dataset}_df"] = dataframe_combined - - # Pre-process and return the results - return preprocess_biology_data(biology_output, file_configuration) +# # Add SQL file to dict +# file_configuration["database"]["biology"] = ( +# Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] +# ) + +# # Iterate through the different biology datasets and read them in +# for dataset in list(biology_file_ids.keys()): +# # ---- Get dataset-specific file lists +# dataset_files = filter_filenames(directory_path, +# file_settings["file_name_formats"][dataset], +# new_biology_files, +# file_settings["extension"]) +# # ---- If there are dataset files available +# if dataset_files: +# # ---- Read in validated biology data +# dataframe_list = [read_biology_csv(Path(file), +# file_settings["file_name_formats"][dataset], +# biology_config_map[dataset]) +# for file in dataset_files] +# # ---- Concatenate the dataset +# dataframe_combined = pd.concat(dataframe_list, ignore_index=True) +# # ---- Lower-case sex +# if "sex" in dataframe_combined.columns: +# dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() +# # ---- Lower-case trawl partition type +# if "trawl_partition" in dataframe_combined.columns: +# dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() +# # ---- Reformat datetime column +# if "datetime" in dataframe_combined.columns: +# dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) +# # ---- Add to the data dictionary +# 
biology_output[f"{dataset}_df"] = dataframe_combined + +# # Pre-process and return the results +# return preprocess_biology_data(biology_output, file_configuration) def validate_hauls_config(spatial_config: dict, link_method: str): @@ -581,6 +511,34 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): f"be one of the following types within a list: {model}." ) +def configure_spatial_settings(file_configuration: dict): + + # Extract spatial strata *only* if spatial information from the configuration settings + # ---- Get (geo)spatial config + spatial_config = file_configuration["geospatial"] + # ---- Remove case sensitivity + spatial_config = {key.lower(): value for key, value in spatial_config.items()} + # ---- Extract the biology-acoustics linking method options + acoustics_biology_link = spatial_config["link_biology_acoustics"] + + # Validate the configuration + validate_spatial_config(spatial_config) + + # Create spatial dictionary that will be added as an `input` + spatial_dict = {"link_method": acoustics_biology_link} + + # Assign the spatial link constraints to the acoustic and biological data + if acoustics_biology_link == "INPFC": + # ---- Update spatial dictionary + spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + # ---- Update the stratum classification in the primary file configuration + file_configuration.update({"spatial_column": ["stratum"]}) + else: + # ---- Empty `spatial_column` key + file_configuration.update({"spatial_column": []}) + + # Return the dictionary as an output + return spatial_dict def validate_spatial_config(spatial_config: dict): diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index f38b130b..c83f35de 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -5,10 +5,10 @@ from ..spatial.projection import utm_string_generator import shapely.geometry -def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): +def create_inpfc_strata(spatial_config: dict): # Extract the INPFC definitions - inpfc_definitions = spatial_config["inpfc"] + inpfc_definitions = spatial_config["inpfc"] # Create latitude bins latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) @@ -17,36 +17,88 @@ def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_con [np.max(inpfc_definitions["stratum_names"]) + 1]]) # Create spatial key - spatial_config["spatial_key"] = pd.DataFrame({ - "latitude_limit": inpfc_definitions["latitude_max"], + inpfc_strata_df = pd.DataFrame({ + "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + "latitude_interval": pd.cut(np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + latitude_bins), + "stratum": bin_names, }) - # ---- Cut - spatial_config["spatial_key"]["stratum"] = ( - pd.cut(inpfc_definitions["latitude_max"], - latitude_bins, - right = True, - labels = bin_names) - ) - - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: - # ---- Bin the latitude data - acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( - acoustic_data["prc_nasc_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) - # Get the `trawl_info_df` values, if they exist, and apply stratification information - if not biology_data["trawl_info_df"].empty: - # ---- Bin the latitude data - biology_data["trawl_info_df"]["stratum"] = pd.cut( - 
biology_data["trawl_info_df"]["latitude"], - latitude_bins, - right = True, - labels = bin_names, - ) + # Add boundaries + # ---- Lower + inpfc_strata_df["lower"] = inpfc_strata_df["latitude_interval"].apply(lambda x: x.left) + # ---- Upper + inpfc_strata_df["upper"] = inpfc_strata_df["latitude_interval"].apply(lambda x: x.right) + + # Return the dataframe + return inpfc_strata_df + +def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): + + # Bin the data based on latitude + if "latitude" in dataset.columns: + dataset["stratum"] = pd.cut( + dataset["latitude"], + np.unique(np.hstack([inpfc_df["lower"], inpfc_df["upper"]])), + labels = inpfc_df["stratum"] + ).astype(int) + + # Return the INPFC-stratified dataset + return dataset + +def apply_spatial_definitions(data_dict: dict, spatial_dict: dict): + + # Get the acoustic-biology link method + link_method = spatial_dict["link_method"] + + # Apply spatial definitions + if link_method == "INPFC": + data_dict.update({ + k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in data_dict.items() + }) + +# def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): + +# # Extract the INPFC definitions +# inpfc_definitions = spatial_config["inpfc"] + +# # Create latitude bins +# latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) +# # ---- Append 1 more stratum layer +# bin_names = np.concatenate([inpfc_definitions["stratum_names"], +# [np.max(inpfc_definitions["stratum_names"]) + 1]]) + +# # Create spatial key +# spatial_config["spatial_key"] = pd.DataFrame({ +# "latitude_limit": inpfc_definitions["latitude_max"], +# }) +# # ---- Cut +# spatial_config["spatial_key"]["stratum"] = ( +# pd.cut(inpfc_definitions["latitude_max"], +# latitude_bins, +# right = True, +# labels = bin_names) +# ) + +# # Get the `prc_nasc_df` values, if they exist, and apply stratification information +# if not acoustic_data["prc_nasc_df"].empty: +# # ---- Bin the latitude data +# acoustic_data["prc_nasc_df"]["stratum"] = pd.cut( +# acoustic_data["prc_nasc_df"]["latitude"], +# latitude_bins, +# right = True, +# labels = bin_names, +# ) + +# # Get the `trawl_info_df` values, if they exist, and apply stratification information +# if not biology_data["trawl_info_df"].empty: +# # ---- Bin the latitude data +# biology_data["trawl_info_df"]["stratum"] = pd.cut( +# biology_data["trawl_info_df"]["latitude"], +# latitude_bins, +# right = True, +# labels = bin_names, +# ) def define_boundary_box(boundary_dict: dict, projection: str): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 579cf463..306ddeb9 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Optional from pathlib import Path import copy @@ -12,6 +12,11 @@ to_linear ) +from .sql_methods import query_processed_files +from .live_acoustics import preprocess_acoustic_data, integrate_nasc +from .live_biology import preprocess_biology_data + + from . import live_data_processing as eldp from . 
import live_data_loading as eldl class LiveSurvey: @@ -47,13 +52,69 @@ def __init__( # Initialize the results attribute self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) - # TODO: Replace Tuple output by appending the "database" key to the respective dataset dict - # Ingest data - # ---- Acoustics - self.input["acoustics"]["prc_nasc_df"] = eldl.load_acoustic_data(self.config) - # ---- Biology - self.input["biology"] = eldp.load_biology_data(self.config) - + # Configure the spatial settings + self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) + # TODO: Add verbosity for printing database filepaths/connections if verbose: - pass \ No newline at end of file + pass + + + def load_acoustic_data(self, + input_filenames: Optional[list] = None, + verbose: bool = True): + + # Validate the data directory and format the filepaths + acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", + input_filenames=input_filenames) + + # Read in the acoustic data files + if acoustic_files: + # ! [REQUIRES DASK] ---- Read in the listed file + # ---- Read in the acoustic data files + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files) + # ---- Add the `acoustic_data_units` to the dictionary + self.config["acoustics"]["dataset_units"] = acoustic_data_units + # ---- Preprocess the acoustic dataset + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df, + self.config) + # TODO: Add verbosity for printing database filepaths/connections + if verbose: + print( + f"The following acoustic files have been processed:\n" + f"{"\n".join(acoustic_files)}." + ) + else: + self.input["acoustics"]["prc_nasc_df"] = None + + def load_biology_data(self, + input_filenames: Optional[list] = None, + verbose: bool = True): + + # Validate the data directory and format the filepaths + biology_files = eldl.validate_data_directory(self.config, dataset="biology", + input_filenames=input_filenames) + + # TODO: Add verbosity for printing database filepaths/connections + if biology_files and verbose: + print( + f"The following biological files have been processed:\n" + f"{"\n".join(biology_files)}." 
+ ) + + # Read in the biology data files + initial_biology_output = eldl.read_biology_files(biology_files, self.config) + + # Preprocess the biology dataset + self.input["biology"], self.input["biology_processed"] = ( + preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + ) + + def process_biology_data(self): + + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + # ---- Processed + biology_processed = self.input["biology_processed"] + diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0d6a6d58..4d253455 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -4,6 +4,7 @@ from typing import Optional, Literal, Union, List import numpy as np from pathlib import Path +import re def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, primary_keys: Optional[list] = None): @@ -60,7 +61,7 @@ def sql_validate(connection: sqla.Connection, table_name: str): inspector = inspect(connection) return table_name in inspector.get_table_names() -def sql_inspect(connection: sqla.Connection, table_name: str): +def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] = None): """ Get a list of all tables present @@ -71,15 +72,26 @@ def sql_inspect(connection: sqla.Connection, table_name: str): list: True if the table exists, False otherwise. """ - # Create 'inspector' for the db file - inspector = inspect(connection) - - # Retrieve column information - column_info = inspector.get_columns(table_name) - - # Format as a dictionary - return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} - + # Inspect the columns from the table + if columns is None: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Retrieve column information + column_info = inspector.get_columns(table_name) + # ---- Format as a dictionary and return the output + return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} + else: + # Inspect unique values in specified columns + # ---- Create SQL command + sql_command = f"SELECT DISTINCT {", ".join(columns)} FROM {table_name};" + # ---- Execute + table = connection.execute(text(sql_command.strip())) + # ---- Extract unique values + unique_values = table.fetchall() + # ---- Format as a dictionary and return the output + return ( + {col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns)} + ) def sql_drop(connection: sqla.Connection, table_name: str): """ @@ -134,16 +146,13 @@ def format_value(x): # ---- Tuple to String # data_str = ", ".join( - # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" - # f"({', '.join(map(lambda x: f'\'{x.replace('\\', '\\\\')}\'' if isinstance(x, str) or isinstance(x, pd.Timestamp) else 'NULL' if x is None else str(x), row))})" - # for row in data_tuple - # ) - flattened_data = [format_value(x) for row in data_tuple for x in row] - data_str = "({})".format(", ".join(flattened_data)) - # data_str = ", ".join( - # "({})".format(", ".join(map(format_value, row))) + # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" + # f"({', '.join(map(lambda x: f'\'{x}\'' + # if isinstance(x, str) or isinstance(x, pd.Timestamp) + # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) + data_str = ", 
".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" @@ -166,15 +175,92 @@ def format_value(x): # Commit connection.commit() +def sql_update(connection: sqla.Connection, table_name: str, columns: list, + dataframe: Optional[pd.DataFrame] = None, operation: Optional[str] = None, + condition: Optional[str] = None): + """ + Insert data into a table. + + Args: + connection (Connection): The SQLAlchemy Connection instance. + table_name (str): The name of the table. + columns (list): List of column names. + data (list of dict): List of dictionaries containing data to insert or update. + conflict_columns (list): List of column names to use for conflict resolution. + """ + + # Prepare the SQL statement for insertion + # ---- Check whether `columns` is '*' + if "*" in columns: + # ---- Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + columns = [col['name'] for col in inspector.get_columns(table_name)] + # ---- If not a List + elif not isinstance(columns, list): + columns = [columns] + + # Format the SET command + # ---- Update column by applying arithmetic between table and dataframe + if operation is not None and dataframe is not None: + set_list = [f"{column} = {column} {operation} {dataframe[column].values[0]}" + for column in columns] + # ---- Update column by applying arithmetic within table + if dataframe is None and operation is not None: + # ---- Make sure `operation` is a list + if not isinstance(operation, list): + operation = [operation] + # ---- Break up the columns into their components + set_list = [f"{column} = {calculation}" for column, calculation in zip(columns, operation)] + # ---- Update column by setting a defined value + if dataframe is not None and operation is None: + set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] + # ---- Join the list + set_clause = ', '.join(set_list) + + # Add the WHERE clause if a parsed condition is provided + if condition is not None: + # ---- Parse the conditional string + parsed_condition = parse_condition(condition) + set_clause += " WHERE " + parsed_condition + + # Complete the full command + sql_command = f"UPDATE {table_name} SET {set_clause};" -def sql_select(connection: sqla.Connection, table_name: str, columns: list, + # Execute + connection.execute(text(sql_command.strip())) + + # Commit + connection.commit() + +def sql_select(connection: sqla.Connection, table_name: str, + columns: Optional[Union[list, str]] = None, + condition: Optional[str] = None, output_type: type = pd.DataFrame): + # Columns + if columns is None: + column_names = "*" + elif isinstance(columns, list) or isinstance(columns, pd.Index): + column_names = ", ".join(columns) + else: + column_names = columns + # Prepare the columns as a string of column names - column_names = ", ".join(columns) + # if isinstance(columns, list): + # column_names = ", ".join(columns) + # else: + # column_names = columns # Format the SQL command - sql_command = f"SELECT {column_names} FROM {table_name};" + # sql_command = f"SELECT {column_names} FROM {table_name};" + sql_command = f"SELECT {column_names} FROM {table_name}" + + # Add the WHERE clause if a parsed condition is provided + if condition is not None: + # ---- Parse the conditional string + parsed_condition = parse_condition(condition) + sql_command += " WHERE " + parsed_condition # Execute the command table = 
connection.execute(text(sql_command)) @@ -200,7 +286,8 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, # ---- Create DataFrame output_df = pd.DataFrame(data, columns=table.keys()) # ---- Format the expected datatypes - df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] for col, dtype in table_dtypes.items()} + df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] + for col, dtype in table_dtypes.items() if col in columns } # ---- Apply the dtypes return output_df.astype(df_dtypes) else: @@ -226,13 +313,14 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), "insert": dict(function=sql_insert, args=["table_name", "columns", "dataframe", "id_columns"]), - "inspect": dict(function=sql_inspect, args=["table_name"]), + "inspect": dict(function=sql_inspect, args=["table_name", "columns"]), "map": dict(function=sql_map_tables, args=[]), - "select": dict(function=sql_select, args=["table_name", "columns", "output_type"]), + "select": dict(function=sql_select, args=["table_name", "columns", "output_type", "condition"]), + "update": dict(function=sql_update, args=["table_name", "columns", "condition", "operation", + "dataframe"]), "validate": dict(function=sql_validate, args=["table_name"]), } - - + SQL_DTYPES = { 'int32': 'INTEGER', 'int64': 'INTEGER', @@ -249,13 +337,144 @@ def sql_select(connection: sqla.Connection, table_name: str, columns: list, "TEXT": str, } +def sql_group_update(db_file: str, + dataframe: pd.DataFrame, + table_name: str, + columns: List[str], + unique_columns: List[str], + id_columns: Optional[List[str]] = None): + + # Check for unique values contained within the table + unique_values = SQL(db_file, "inspect", table_name=table_name, columns=unique_columns) + + # Get the unique values in the table + table_values = {col: dataframe[col].unique().tolist() for col in unique_columns} + + # Find mismatched indices + new_indices = {col: list(set(table_values[col]) - set(unique_values[col])) + for col in unique_columns} + + # Filter the DataFrame to include only rows with these missing values + # ---- Create DataFrame copy + filtered_df = dataframe.copy() + # ---- Iterate through the extracted dictionary + for col, missing_vals in new_indices.items(): + if missing_vals: + filtered_df = filtered_df[filtered_df[col].isin(missing_vals)] + else: + # ---- Drop the values that are not contained within the list + filtered_df = pd.DataFrame(columns=filtered_df.columns) + + # Insert into the table if not otherwise present + if not filtered_df.empty: + SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) + + # Update the table + # ---- Format the conditional string + case_statements = [] + for col in columns: + case_stmt = "CASE" + for _, row in dataframe.iterrows(): + # Construct the filter condition based on unique_columns + filter_conditions = ' AND '.join([ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ]) + # Add the WHEN condition to the CASE statement + case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" + case_stmt += " END" + case_statements.append(f"{col} = {case_stmt}") + + # Construct the full SQL UPDATE statement + update_clause = ', '.join(case_statements) + + # Format the SQL COMMAND string + sql_command = f""" + UPDATE {table_name} + SET {update_clause} + WHERE ({' OR '.join([ + 
' AND '.join([ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ]) + for _, row in dataframe.iterrows() + ])}); + """ + + # Create engine + engine = create_engine(f"sqlite:///{db_file}") + + # Execute and commit + with engine.connect() as connection: + connection.execute(text(sql_command)) + connection.commit() + + # Dispose engine + engine.dispose() + +def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: + + # Get the data input column names + if data_dict[table_name].empty: + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of the data columns + table_columns = list(inspected_table.keys()) + else: + # ---- Get the DataFrame column names + table_columns = data_dict[table_name].columns + + # Create a list of the primary keys + key_columns = ( + set(table_columns) + .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", + "latitude", "stratum"]) + ) + + # Return a list of the output + return list(key_columns) + +def parse_condition(condition: str): + # Replace logical operators with SQL equivalents + condition = condition.replace('&', ' AND ').replace('|', ' OR ') + + # Handle "IN" lists and replace square brackets with parentheses + condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) + + # Handle range conditions for BETWEEN, including floats + condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + + # Handle individual comparisons + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + + # Return the parsed condition + return condition + +def format_sql_select(table_name, column_names, condition_string): + # Base SQL command to select columns from the table + sql_command = f"SELECT {column_names} FROM {table_name}" + + # Parse the condition string + parsed_condition = parse_condition(condition_string) + + # Add the WHERE clause if a parsed condition is provided + if parsed_condition: + sql_command += " WHERE " + parsed_condition + + # Add a semicolon at the end of the SQL command + sql_command += ";" + + return sql_command + def format_sql_columns(kwargs: dict): # Columns - if "columns" in kwargs: + if "columns" in kwargs and "condition" not in kwargs: if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) - else: + elif "columns" not in kwargs: kwargs["columns"] = "*" # ID/Conflict columns @@ -267,14 +486,14 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation -def query_processed_files(root_directory: str, file_settings: dict, files: List[Path]) -> dict: +def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path]) -> dict: # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" + db_directory = root_directory / "database" # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file @@ -301,7 +520,7 @@ 
def query_processed_files(root_directory: str, file_settings: dict, files: List[ # Query already existing files previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns="filepath") + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) # Filter out previously processed files # ---- Apply filter by comparing sets and return the output @@ -321,11 +540,6 @@ def sql_data_exchange(database_file: Path, **kwargs): if not table_exists: # ---- Create table SQL(database_file, "create", **kwargs) - # ---- Ignore the `id_columns` argument, if present - try: - del kwargs["id_columns"] - except KeyError: - pass # ---- Insert into table SQL(database_file, "insert", **kwargs) # ---- Return the initial dataframe @@ -345,7 +559,8 @@ def SQL(db_file: str, command: str, **kwargs): engine = create_engine(f"sqlite:///{db_file}") # Format the data columns, if necessary, to fit within the SQL commands - kwargs = format_sql_columns(kwargs) + if command not in ["inspect", "update", "select"]: + kwargs = format_sql_columns(kwargs) # Run the command try: diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index bb5a6423..eae68ec2 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -306,8 +306,13 @@ def group_merge(dataframe, dataframes_to_add, inner_on, outer_on, how="outer", d def group_interpolator_creator( - grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, contrast: str + grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, + contrast: Union[List[str], str] ) -> dict: + + # Check if `contrast` is a list or not + if not isinstance(contrast, list): + contrast = [] # Interpolator generation helper function def interpolator_factory(sub_group): @@ -323,7 +328,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( - grouped_data.groupby([contrast]).apply( + grouped_data.groupby(contrast).apply( lambda group: interpolator_factory(group), include_groups=False ) ).to_dict() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index ba7c2a2c..b657b07d 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd import matplotlib.pyplot as plt -from typing import Union, Tuple +from typing import Union, Tuple, Optional from pathlib import Path import copy import yaml @@ -13,38 +13,132 @@ import re import contextlib from sqlalchemy import create_engine, text, Engine, inspect -from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP +from echopop.live.live_data_loading import validate_data_directory +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns from echopop.live import live_data_processing as eldp - +from echopop.live import live_data_loading as eldl +from echopop.live.live_survey import LiveSurvey +from echopop.live.live_acoustics import preprocess_acoustic_data +from echopop.live.live_biology import preprocess_biology_data +from echopop.survey import Survey + +survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", 
"C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") +survey_2019.transect_analysis() +survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] +analysis_dict = survey_2019.analysis["transect"] + +proportions_dict=analysis_dict["biology"]["proportions"]["number"] +length_weight_dict = analysis_dict["biology"]["weight"] +stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths +self = LiveSurvey live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # ---- Run function: `live_configuration` -file_configuration = live_configuration(live_init_config_path, live_file_config_path) -file_configuration.update({"database": {"acoustics": None, "biology": None}}) -#################################################################################################### -# * Accessory function for tuning the acoustic transmit frequency units/scaling -def format_vlaue(x): - pass - -def format_value(x): - if isinstance(x, str): - return "'{}'".format(x.replace("'", "''")) - elif isinstance(x, pd.Timestamp): - return "'{}'".format(x) - elif x is None: - return 'NULL' - else: - return str(x) +file_configuration = self.config +files = biology_files + +biology_output = initial_biology_output +file_configuration = self.config +table_name = "length_df" +df = filtered_biology_output[table_name] +database_file = biology_db +kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) + +def process_biology_data(self): + + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + # ---- Processed + biology_processed = self.input["biology_processed"] + + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], + self.config) + + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) -data_str = ", ".join( - "({})".format(", ".join(format_value(x) for x in row)) - for row in data_tuple -) + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) + + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], + self.config) + ) + + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, length_binned, + self.config) + ) + + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + fitted_weight_df = 
compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) +catch_data = self.input["biology"]["catch_df"] +# Get the spatial column name, if there is one +contrast_columns = file_configuration["spatial_column"].copy() +# ---- Append additional columns that will be used +contrast_columns.extend(["sex", "species_id"]) + +# Calculate grouped totals +# ---- Specimen +specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + + +# Calculate the sexed and total stratum weights for each sex among unaged fish +# ---- Sum the net haul weights from station 1/unaged fish +catch_weights = catch_data.count_variable( + contrasts=["species_id"] + file_configuration["spatial_column"], + variable="haul_weight", fun="sum" +) +# ---- Rename resulting columns for both +catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# Sum the sexed and total weights from the weight-fitted unaged data +# ---- Extract the unaged/length quantized weights +unaged_weights_binned = distributions_dict["unaged_length_weight_tbl"].copy() +# ---- Calculate the total weight per stratum per sex +unaged_weights_sex = unaged_weights_binned.sum() +# ---- Length (by sex) +length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum()#.to_frame("weight") +# ---- Further reduce +length_weight_total = length_weights_sex.transpose().unstack(0).sum(axis=0) +# ---- Standardize the unaged sexed weights +(length_weights_sex / length_weight_total).unstack(0) * catch_weights["total_weight"].to_numpy() + + +length_weight_total = ( + length_weights_sex.reset_index(list(set(contrast_columns)-set(file_configuration["spatial_column"].copy()))) + ["weight_interp"].sum() +) +# ---- Calculate the stratum totals +unaged_strata_weights = unaged_weights_sex.unstack(0).sum(axis=0) +# ---- Standardize the unaged sexed weights +unaged_weights_sex_standardized = (unaged_weights_sex / unaged_strata_weights).unstack( + 0 +) * catch_strata_weights["stratum_weight"].to_numpy() #################################################################################################### # * Functionality for reading in processed acoustic data @@ -182,6 +276,37 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # SQL(database_file, "drop", table_name="nasc_df") # SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] + +def process_acoustic_data(self, + echometrics: bool = True): + + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = ( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + ) + + # ---- Amend the dtypes if echometrics were computed + if echometrics: + # ---- Set dtypes + nasc_data_df = ( + nasc_data_df + .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, + "center_of_mass": float, "dispersion": float, "evenness": float, + "aggregation_index": float, "occupied_area": float}) + ) + # ---- Reorder columns + nasc_data_df = nasc_data_df[[ + "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", + "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area" + ]] + + def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: 
dict, echometrics: bool = True): @@ -389,6 +514,10 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): acoustic_data = self.input["acoustics"] biology_data = self.input["biology"] + + +from echopop.live.live_core import SPATIAL_CONFIG_MAP + def load_spatial_data(acoustic_data: dict, biology_data: dict, file_configuration: dict,): @@ -406,9 +535,15 @@ def load_spatial_data(acoustic_data: dict, # Validate the configuration validate_spatial_config(spatial_config) + # Create spatial dictionary that will be added as an `input` + spatial_dict = {"link_method": acoustics_biology_link} + # Assign the spatial link constraints to the acoustic and biological data if acoustics_biology_link == "INPFC": - apply_inpfc_definitions(acoustic_data, biology_data, spatial_config) + spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + + # Return the dictionary as an output + return spatial_dict @@ -552,6 +687,8 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di __all__ = ["operations"] +biology_data = self.input["biology"] + # Meld bio datasets length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], contrasts=["haul_num", "species_id", "length"]) @@ -576,7 +713,10 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di # file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] -def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Optional[float] = None, TS_L_intercept: Optional[float] = None, weighted: Optional[Union[float, int, str]] = None): +def average_sigma_bs(length: Union[pd.DataFrame, float, int], + TS_L_slope: Optional[float] = None, + TS_L_intercept: Optional[float] = None, + weighted: Optional[Union[float, int, str]] = None): # if isinstance(length, pd.DataFrame): @@ -648,6 +788,36 @@ def average_sigma_bs(length: Union[pd.DataFrame, float, int], TS_L_slope: Option else: return sigma_bs_value.mean() +def parse_condition(condition): + # Handle nested conditions and logical operators + condition = condition.replace('&', ' AND ').replace('|', ' OR ') + + # Handle "IN" lists and replace square brackets with parentheses + condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) + + # Handle range conditions for BETWEEN, including floats + condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + + # Handle individual comparisons + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + + # Handle single equal sign + condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", condition) + + # Remove redundant spaces + condition = re.sub(r'\s+', ' ', condition).strip() + + return condition + +columns = ["sigma_bs_sum", "sigma_bs_count"] +operation = "+" +table_name = "sigma_bs_mean_df" +dataframe = sigma_bs_df +condition = condition_str + +SQL(acoustic_db, "select", table_name="files_read") average_sigma_bs ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) @@ -671,13 +841,57 @@ def integrate_nasc(prc_nasc_df: pd.DataFrame): }) +current = 10 ** (-60/10) +count = 5 +old_tuple = (current, count) + +new = 10 ** (-50/10) +count = 2 
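+# NOTE (editorial sketch, not part of the original patch): the scratch lines around here
+# test how the stratum-level mean of linear-domain sigma_bs can be updated as new hauls
+# arrive, using the `sigma_bs_sum`/`sigma_bs_count` accumulator columns referenced above.
+# A minimal, hypothetical helper for that bookkeeping might look like:
+def update_running_sigma_bs(old_sum: float, old_count: int, new_ts_db: np.ndarray):
+    # Convert incoming TS values (dB) to linear sigma_bs and accumulate the sum/count
+    new_linear = 10 ** (new_ts_db / 10.0)
+    total_sum = old_sum + new_linear.sum()
+    total_count = old_count + new_linear.size
+    # The running mean is recovered from the accumulated sum and count, which is
+    # equivalent to the count-weighted average of the old and new batch means
+    return total_sum, total_count, total_sum / total_count
+
+# Example: pooling the two batches above reproduces the mean of the combined samples, e.g.
+# update_running_sigma_bs(old_sum=current * 5, old_count=5, new_ts_db=np.array([-50.0, -50.0]))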
+ +data = pd.DataFrame({"value": 10 ** (np.array([-60.0, -50.0]) / 10.0), + "count": np.array([5, 2]) }) + +data = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61]) / 10.0)}) +data_new = pd.DataFrame({"value": 10 ** (np.array([-51, -52, -53, -52, -54, -56, -58]) / 10.0)}) +data["value"].sum() / data["value"].size +data_new["value"].sum() / data_new["value"].size + +(data["value"].sum() + data_new["value"].sum()) / (data["value"].size + data_new["value"].size) + +data_test = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61, -51, -52, -53, -52, -54, -56, -58]) / 10.0)}) +data_test["value"].mean() + +data["value"].mean() + +data["value"].sum() + +old_number = np.average(data["value"], weights=data["count"]) +old_count = data["count"].sum() + +new_number = np.array([-80.0, -70.0, -60.0, -70.0, -80.0]) +new_count = len(new_number) +new_mean = 10 ** (new_number.mean() / 10) + +np.average(np.concatenate([[old_number], [new_mean]]), + weights = np.concatenate([[old_count], [new_count]])) + +np.mean(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) +np.average(data["value"], weights=data["count"]) + +np.sum(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) pd.read_fr pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) +db_file = self.config["database"]["acoustics"] engine = create_engine(f"sqlite:///{db_file}") connection = engine.connect() -kwargs["dataframe"].to_sql(name=kwargs["table_name"], + +SQL(db_file, "select", table_name="sigma_bs_mean_df", condition="stratum = 1") + + +kwargs["dataframe"].to_sql(name=kwa +rgs["table_name"], con=connection, if_exists="append", index=False) connection.close() From a4a51a6d03820da9560985d6df0702f826819a3c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 09:37:40 -0700 Subject: [PATCH 10/81] Format some changes to methods --- echopop/live/live_acoustics.py | 65 +++++- echopop/live/live_biology.py | 2 + echopop/live/live_core.py | 4 +- echopop/live/live_data_loading.py | 4 +- echopop/live/live_spatial_methods.py | 38 +++- echopop/live/live_survey.py | 38 +++- echopop/live/sql_methods.py | 36 ++- echopop/test_workflow.py | 33 +++ echopop/zarr_read_ingest_test.py | 329 --------------------------- 9 files changed, 181 insertions(+), 368 deletions(-) create mode 100644 echopop/test_workflow.py diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 21ba1e23..6afc5bc2 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -3,6 +3,8 @@ import pandas as pd from echopop.acoustics import ts_length_regression, to_linear, to_dB +from .live_spatial_methods import apply_spatial_definitions +from .sql_methods import sql_data_exchange # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -25,6 +27,7 @@ def configure_transmit_frequency(frequency_values: pd.Series, # TODO: Documentation def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, + spatial_dict: dict, file_configuration: dict) -> pd.DataFrame: # Get acoustic processing settings @@ -34,8 +37,8 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df["frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df["frequency_nominal"], + prc_nasc_df.loc[:, "frequency_nominal"] = ( + configure_transmit_frequency(prc_nasc_df.loc[:, "frequency_nominal"], 
transmit_settings, acoustic_analysis_settings["dataset_units"]["frequency"]) ) @@ -43,6 +46,11 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, prc_nasc_df_filtered = ( prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] ) + + # Apply spatial settings + prc_nasc_df_filtered.loc[:, "stratum"] = ( + apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) + ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object # ---- Replace NASC `NaN` values with `0.0` @@ -176,13 +184,19 @@ def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): # Convert `nasc_dict` to a DataFrame and return the output return pd.Series(nasc_dict) -def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): +def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, + echometrics: bool = True): + # Get spatial definitions, if any + spatial_column = file_configuration["spatial_column"] + # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(integrate_nasc, echometrics, include_groups=False) + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) + .apply(integrate_nasc, echometrics) .unstack().reset_index() + .sort_values("ping_time") ) # ---- Amend the dtypes if echometrics were computed if echometrics: @@ -194,8 +208,39 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): "aggregation_index": float, "occupied_area": float}) ) # ---- Reorder columns - nasc_data_df = nasc_data_df[[ - "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", - "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", - "occupied_area" - ]] + nasc_data_df = nasc_data_df[ + spatial_column + + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", + "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", + "occupied_area"] + ] + + # Return the output + return nasc_data_df + +def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict): + + # Get acoustic database filename + acoustic_db = file_configuration["database"]["acoustics"] + + # Create a copy of the dataframe + df = nasc_data_df.copy() + + # Add population-specific columns (specified in the file configuration) + # TODO: Add to `yaml` file for configuration; hard-code for now + add_columns = ["number_density", "biomass_density", "abundance", "biomass"] + # ---- + df[add_columns] = 0.0 + # ---- Assign values for key values + key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] + # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint + df.loc[:, "id"] = key_values + + # Insert the new data into the database & pull in the combined dataset + # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT + _ = sql_data_exchange(acoustic_db, dataframe=df, table_name="survey_data_df", + id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) + + # Return the formatted dataframe + return df + diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index cf04589b..76e24e6d 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,8 @@ import pandas as pd import numpy as np from .sql_methods import SQL, sql_data_exchange, 
get_table_key_names +from .live_spatial_methods import apply_spatial_definitions +from .live_acoustics import average_sigma_bs from echopop.acoustics import ts_length_regression, to_dB, to_linear from echopop.utils.operations import group_interpolator_creator from functools import reduce diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 677cddc3..256b9f27 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -53,13 +53,13 @@ "dtypes": { "partition": str, "species_code": int, - "sample_weight_kg": float, + "overall_weight": float, "catch_perc": float, }, "names": { "partition": "trawl_partition", "species_code": "species_id", - "sample_weight_kg": "haul_weight", + "overall_weight": "haul_weight", "catch_perc": "catch_percentage", } }, diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 1220591f..823ebac4 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -213,11 +213,11 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate that files even exist # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_settings["extension"]}")) + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) if not data_files: raise FileNotFoundError( - f"No `*.{file_settings["extension"]}` files found in [{directory_path}]!" + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" ) # Check and format specific input filenames diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index c83f35de..2dd8cefc 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -4,6 +4,7 @@ from geopy.distance import distance from ..spatial.projection import utm_string_generator import shapely.geometry +from typing import Union def create_inpfc_strata(spatial_config: dict): @@ -34,28 +35,43 @@ def create_inpfc_strata(spatial_config: dict): return inpfc_strata_df def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): + + # Create dataset copy + dataset = dataset.copy() # Bin the data based on latitude - if "latitude" in dataset.columns: - dataset["stratum"] = pd.cut( - dataset["latitude"], - np.unique(np.hstack([inpfc_df["lower"], inpfc_df["upper"]])), - labels = inpfc_df["stratum"] + if isinstance(dataset, pd.DataFrame) and "latitude" in dataset.columns: + dataset.loc[:, "stratum"] = pd.cut( + dataset.loc[:, "latitude"], + np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), + labels = inpfc_df.loc[:, "stratum"] ).astype(int) + + return dataset + else: + strata = pd.cut(dataset.copy(), + np.unique(np.hstack([inpfc_df.loc[:, "lower"], + inpfc_df.loc[:, "upper"]])), + labels = inpfc_df.loc[:, "stratum"] + ) + + return strata # Return the INPFC-stratified dataset - return dataset + # return dataset -def apply_spatial_definitions(data_dict: dict, spatial_dict: dict): +def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dict): # Get the acoustic-biology link method link_method = spatial_dict["link_method"] - + # Apply spatial definitions - if link_method == "INPFC": - data_dict.update({ - k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in data_dict.items() + if isinstance(dataset, dict) and link_method == "INPFC": + dataset.update({ + k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items() }) + elif 
isinstance(dataset, pd.Series) and link_method == "INPFC": + return apply_inpfc_definitions(dataset, spatial_dict["strata"]) # def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 306ddeb9..adb67cc3 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -13,7 +13,7 @@ ) from .sql_methods import query_processed_files -from .live_acoustics import preprocess_acoustic_data, integrate_nasc +from .live_acoustics import preprocess_acoustic_data, compute_nasc from .live_biology import preprocess_biology_data @@ -59,7 +59,6 @@ def __init__( if verbose: pass - def load_acoustic_data(self, input_filenames: Optional[list] = None, verbose: bool = True): @@ -76,13 +75,17 @@ def load_acoustic_data(self, # ---- Add the `acoustic_data_units` to the dictionary self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset - self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df, + # TODO: SettingWithCopyWarning: + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), + self.input["spatial"], self.config) # TODO: Add verbosity for printing database filepaths/connections if verbose: + # ---- Create file list + file_list = "\n".join(acoustic_files) print( f"The following acoustic files have been processed:\n" - f"{"\n".join(acoustic_files)}." + f"{file_list}." ) else: self.input["acoustics"]["prc_nasc_df"] = None @@ -97,9 +100,11 @@ def load_biology_data(self, # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: - print( + # ---- Create file list + file_list = "\n".join(biology_files) + print( f"The following biological files have been processed:\n" - f"{"\n".join(biology_files)}." + f"{file_list}." 
) # Read in the biology data files @@ -111,10 +116,21 @@ def load_biology_data(self, ) def process_biology_data(self): + # method here + pass + + def process_acoustic_data(self, echometrics: bool = True): + + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - # Separate out processed and unprocessed biological data - # ----- Unprocessed - biology_unprocessed = self.input["biology"] - # ---- Processed - biology_processed = self.input["biology_processed"] + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + + # Format the dataframe and insert into the LiveSurvey object + self.input["nasc_df"] = nasc_data_df + + def estimate_population(self): + # method here + pass diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 4d253455..0bb47306 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -45,7 +45,12 @@ def sql_map_tables(connection: sqla.Connection): """ """ inspector = inspect(connection) - return inspector.get_table_names() + table_names = inspector.get_table_names() + # result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) + # table_names = result.fetch_all() + # Extract table names from the results + # table_names = [name[0] for name in table_names] + return table_names def sql_validate(connection: sqla.Connection, table_name: str): """ @@ -83,7 +88,7 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] else: # Inspect unique values in specified columns # ---- Create SQL command - sql_command = f"SELECT DISTINCT {", ".join(columns)} FROM {table_name};" + sql_command = f"SELECT DISTINCT {', '.join(columns)} FROM {table_name};" # ---- Execute table = connection.execute(text(sql_command.strip())) # ---- Extract unique values @@ -96,7 +101,7 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] def sql_drop(connection: sqla.Connection, table_name: str): """ """ - connection.execute(text(f"DROP TABLE IF EXISTS {table_name};")) + connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, id_columns: Optional[list] = None): @@ -551,6 +556,31 @@ def sql_data_exchange(database_file: Path, **kwargs): # Select existing data frame the database and return the output return SQL(database_file, "select", **kwargs) +def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + + # Get all database files + database_files = file_configuration["database"] + + # Iterate through all keys + for _, db_file in database_files.items(): + # ---- Map the table names + table_names = SQL(db_file, "map") + # ---- Drop any noted exceptions + if not isinstance(table_exception, list): + table_exception = [table_exception] + # ---- Drop exception table name + if None not in table_exception: + table_names = list(set(table_names) - set(table_exception)) + # ---- Iterate through + for table_name in table_names: + SQL(db_file, "drop", table_name=table_name) + # ---- Validate that all tables were removed + remaining_tables = SQL(table_names, "map") + if set(table_names).intersection(set(remaining_tables)): + raise ValueError( + f"Attempted reset of [{str(db_file)}] failed." 
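# NOTE: Hedged sketch of the table-reset behaviour implemented by `reset_db_files` above,
# written against the sqlite3 standard library rather than the package's `SQL(...)` helper:
# list the tables in a database file, drop everything outside an exception list, and confirm
# that only the exempted tables remain.
import sqlite3

def reset_sqlite_tables(db_path, keep=None):
    keep = keep or []
    con = sqlite3.connect(db_path)
    try:
        tables = [row[0] for row in
                  con.execute("SELECT name FROM sqlite_master WHERE type='table';")]
        for table in set(tables) - set(keep):
            con.execute(f'DROP TABLE IF EXISTS "{table}";')
        con.commit()
        remaining = [row[0] for row in
                     con.execute("SELECT name FROM sqlite_master WHERE type='table';")]
        if set(remaining) - set(keep):
            raise ValueError(f"Attempted reset of [{db_path}] failed.")
    finally:
        con.close()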
+ ) + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py new file mode 100644 index 00000000..bb8c2bb0 --- /dev/null +++ b/echopop/test_workflow.py @@ -0,0 +1,33 @@ +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import reset_db_files + +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" + +realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) + +#################################################################################################### +# TEST: ACOUSTICS +#################################################################################################### +# NOTE: Reset database file for utility purposes +reset_db_files(realtime_survey.config) + +# NOTE: LOAD DATA +realtime_survey.load_acoustic_data() +# NOTE: INITIAL PROCESSING [JUST ACOUSTIC] +realtime_survey.process_acoustic_data() +realtime_survey.input +#################################################################################################### +# TEST: BIOLOGY +#################################################################################################### +# NOTE: Reset database file for utility purposes +reset_db_files(realtime_survey.config) + +# NOTE: LOAD DATA +realtime_survey.load_biology_data() +realtime_survey.input +#################################################################################################### +# TEST: POPULATION ESTIMATES +#################################################################################################### +# NOTE: Acoustic / biological data converge here to derive population estimates +realtime_survey.estimate_population() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index b657b07d..2e7ef567 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -149,195 +149,6 @@ def process_biology_data(self): root_directory = file_configuration["data_root_dir"] -#################################################################################################### -# TEST: ACOUSTIC ZARR FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -acoustic_data = load_acoustic_data(file_configuration) -acoustic_data -file_configuration["database"] - -def estimate_echometrics(acoustic_data_df: pd.DataFrame): - - # Create copy - acoustic_df = acoustic_data_df.copy().reset_index(drop=True) - - # Pre-compute the change in depth - acoustic_df["dz"] = acoustic_df["depth"].diff() - - # Initialize echometrics dictionary - echometrics = {} - - # Compute the metrics center-of-mass - if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation": np.nan, - "occupied_area": 0.0, - }) - else: - - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / 
acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. ) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass - echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the dispersion - echometrics.update({ - "dispersion": ( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) - - # Return the dictionary - return echometrics - -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): - - # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} - - # Horizontally concatenate `echometrics`, if `True` - if echometrics: - # ---- Compute values - # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) - # ---- Merge - nasc_dict.update(echometrics_dict) - - # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) - - -acoustic_data_df = acoustic_data["prc_nasc_df"] - - - -# SQL(database_file, "drop", table_name="nasc_df") -# SQL(database_file, "validate", **kwargs) -# SQL(database_file, "create", table_name="nasc_df", primary_keys=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "validate", **kwargs) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", id_columns=["latitude", "longitude", "ping_time"], dataframe=nasc_data_df) -# SQL(database_file, "select", table_name="nasc_df") -# SQL(database_file, "insert", table_name="nasc_df", dataframe=nasc_data_df) -# SQL(database_file, "drop", table_name="nasc_df") -# SQL_DTYPES[type(dataframe["ping_time"][0]).__name__] - - -def process_acoustic_data(self, - echometrics: bool = True): - - # Get the unprocessed acoustic data - acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(integrate_nasc, echometrics, include_groups=False) - .unstack().reset_index() - ) - - # ---- Amend the dtypes if echometrics were computed - if echometrics: - # ---- Set dtypes - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation_index": float, "occupied_area": float}) - ) - # ---- Reorder columns - nasc_data_df = nasc_data_df[[ - "longitude", "latitude", "ping_time", "nasc", "n_layers", "nasc_db", "mean_Sv", - "max_Sv", "aggregation_index", "center_of_mass", 
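# NOTE: Compact sketch of the layer metrics computed in `estimate_echometrics` above
# (center of mass, dispersion, evenness, and aggregation index) for a single vertical
# profile of depth and NASC. Values are toy numbers.
import numpy as np

depth = np.array([10.0, 20.0, 30.0, 40.0])
nasc = np.array([0.0, 150.0, 300.0, 50.0])
nasc_total = nasc.sum()
center_of_mass = (depth * nasc).sum() / nasc_total
dispersion = (((depth - center_of_mass) ** 2) * nasc).sum() / nasc_total
evenness = (nasc ** 2).sum() / nasc_total ** 2
aggregation_index = 1.0 / evenness
print(center_of_mass, dispersion, evenness, aggregation_index)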
"dispersion", "evenness", - "occupied_area" - ]] - - -def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) - .reset_index() - ) - # ---- Amend the dtypes if echometrics were computed - if echometrics: - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation": float, "occupied_area": float}) - ) - - # Get the acoustics database file - acoustics_db = file_configuration["database"]["acoustics"] - - # Insert the new data into the database and pull in the combined previous and new data combined - full_nasc_df = sql_data_exchange(acoustics_db, dataframe=nasc_data_df, - table_name="nasc_df", - id_columns=["longitude", "latitude", "ping_time"], - primary_keys=["longitude", "latitude", "ping_time"], - output_type=pd.DataFrame) - - # Return the output - return full_nasc_df - #################################################################################################### def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): @@ -811,146 +622,6 @@ def parse_condition(condition): return condition -columns = ["sigma_bs_sum", "sigma_bs_count"] -operation = "+" -table_name = "sigma_bs_mean_df" -dataframe = sigma_bs_df -condition = condition_str - -SQL(acoustic_db, "select", table_name="files_read") -average_sigma_bs - -ts_lengths_df.groupby(["haul_num"]).apply(average_sigma_bs).apply(lambda x: to_dB(x)) -def integrate_nasc(prc_nasc_df: pd.DataFrame): - -# Compute the number of layers -echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size -}) - -# Compute the index of aggregation -echometrics.update({ - "aggregation": 1 / echometrics["evenness"] -}) - -# Get the occupied area -echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) -}) - - -current = 10 ** (-60/10) -count = 5 -old_tuple = (current, count) - -new = 10 ** (-50/10) -count = 2 - -data = pd.DataFrame({"value": 10 ** (np.array([-60.0, -50.0]) / 10.0), - "count": np.array([5, 2]) }) - -data = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61]) / 10.0)}) -data_new = pd.DataFrame({"value": 10 ** (np.array([-51, -52, -53, -52, -54, -56, -58]) / 10.0)}) -data["value"].sum() / data["value"].size -data_new["value"].sum() / data_new["value"].size - -(data["value"].sum() + data_new["value"].sum()) / (data["value"].size + data_new["value"].size) - -data_test = pd.DataFrame({"value": 10 ** (np.array([-61, -62, -63, -62, -61, -51, -52, -53, -52, -54, -56, -58]) / 10.0)}) -data_test["value"].mean() - -data["value"].mean() - -data["value"].sum() - -old_number = np.average(data["value"], weights=data["count"]) -old_count = data["count"].sum() - -new_number = np.array([-80.0, -70.0, -60.0, -70.0, -80.0]) -new_count = len(new_number) -new_mean = 10 ** (new_number.mean() / 10) - -np.average(np.concatenate([[old_number], [new_mean]]), - weights = np.concatenate([[old_count], [new_count]])) - -np.mean(10 ** (np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) -np.average(data["value"], weights=data["count"]) - -np.sum(10 ** 
(np.array([-60.0, -60.0, -60.0, -60.0, -60.0, -50.0, -50.0, -80.0, -70.0, -60.0, -70.0, -80.0]) / 10)) - - -pd.read_fr -pd.read_sql(text(SQL_COMMANDS["select"].format(**kwargs)), con=connection) -db_file = self.config["database"]["acoustics"] -engine = create_engine(f"sqlite:///{db_file}") -connection = engine.connect() - -SQL(db_file, "select", table_name="sigma_bs_mean_df", condition="stratum = 1") - - -kwargs["dataframe"].to_sql(name=kwa -rgs["table_name"], - con=connection, - if_exists="append", index=False) -connection.close() -engine.dispose() -SQL(db_file, "insert", table_name=table_name, columns="*", - filter_columns=insertion_filter, - dataframe=df) - -SQL(db_file, "select", table_name="files_read") -SQL(db_file, "select", table_name="catch_df") -SQL(db_file, "select", table_name="specimen_df") -SQL(db_file, "select", table_name="length_df") - -def check_table_schema(connection, **kwargs): - query = text(("PRAGMA table_info({table_name});").format(**kwargs)) - schema = connection.execute(query).fetchall() - print("Table Schema:", schema) - -check_table_schema(connection, table_name=table_name) - -def insert_test_data(connection, table_name): - test_data = pd.DataFrame({ - 'trawl_partition': ['test'], - 'species_id': ['test'], - 'haul_weight': [0.0], - 'catch_percentage': [0.0], - 'haul_num': [1] - }) - - test_data.to_sql(name=table_name, con=connection, if_exists='append', index=False) - print("Test data inserted.") - -insert_test_data(connection, table_name) - -kwargs = {} -command = "insert" -kwargs["table_name"] = "catch_df" -kwargs["dataframe"] = df -kwargs["filter_columns"] = insertion_filter -columns = "*" - - -re.compile(file_name_format) -pattern = file_name_format -pattern = pattern.replace('{DATE:YYYYMM}', r'(?P\d{6})') -pattern = pattern.replace('{HAUL}', r'(?P\d+)') -pattern = pattern.replace('{FILE_ID}', r'(?P.+)') -regex = re.compile(pattern) -haul_values = [] - -file_name_format.search(file.name) -sub_df_lst = [] -for file in subcsv_files: - match = regex.search(file.name) - if match: - haul_value = match.group('HAUL') - df = pd.read_csv(file, usecols=list(sub_config_map.keys())) - df['HAUL'] = haul_value # Append HAUL value as a new column - sub_df_lst.append(df) #################################################################################################### def load_spatial_data(file_configuration: dict, acoustic_data: pd.DataFrame, From e395405efba52088ec6734994a8aedee647a42b1 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 10:34:50 -0700 Subject: [PATCH 11/81] Quick patch --- echopop/test_workflow.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index bb8c2bb0..9678736b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -26,8 +26,12 @@ # NOTE: LOAD DATA realtime_survey.load_biology_data() realtime_survey.input +# NOTE: INITIAL PROCESSING [JUST BIOLOGY] +realtime_survey.process_biology_data() +realtime_survey.input #################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates +# TODO: Add argument that indicates what the new datasets and what data need to be pulled in realtime_survey.estimate_population() \ No newline at end of file From 2f04cab604d6100c8afaea2003d392a9da6c1372 Mon Sep 17 00:00:00 2001 From: Brandyn 
Lucca Date: Thu, 1 Aug 2024 12:26:56 -0700 Subject: [PATCH 12/81] Fleshed out biology processing methods --- echopop/__init__.py | 2 +- echopop/live/__init__.py | 2 +- echopop/live/live_acoustics.py | 2 +- echopop/live/live_biology.py | 22 ++-- echopop/live/live_survey.py | 94 +++++++++++-- echopop/test_workflow.py | 14 +- echopop/zarr_read_ingest_test.py | 219 +++++++++++++++++++++++++++---- 7 files changed, 296 insertions(+), 59 deletions(-) diff --git a/echopop/__init__.py b/echopop/__init__.py index a28b91b0..7dfc17fd 100644 --- a/echopop/__init__.py +++ b/echopop/__init__.py @@ -3,4 +3,4 @@ __all__ = ["Survey", "operations"] -from _echopop_version import version as __version__ # noqa +# from _echopop_version import version as __version__ # noqa diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index f4e742bb..325afcbb 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -2,4 +2,4 @@ __all__ = ["operations"] -from _echopop_version import version as __version__ # noqa \ No newline at end of file +# from _echopop_version import version as __version__ # noqa \ No newline at end of file diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 6afc5bc2..11d0b392 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from echopop.acoustics import ts_length_regression, to_linear, to_dB +from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions from .sql_methods import sql_data_exchange diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 76e24e6d..efc88765 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -3,8 +3,8 @@ from .sql_methods import SQL, sql_data_exchange, get_table_key_names from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator +from ..acoustics import ts_length_regression, to_dB, to_linear +from ..utils.operations import group_interpolator_creator from functools import reduce def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): @@ -129,16 +129,12 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, file_configuration: dict): - # Assign contrast columns - contrast_list = [] - # ---- Check for "stratum" column - if "stratum" in specimen_data.columns and "stratum" in length_data.columns: - contrast_list.append(["stratum"]) - # ---- Add the additional columns - contrast_list.append(["haul_num", "species_id", "length"]) - # ---- Concatenate - contrast_columns = list(np.concatenate(contrast_list)) - + # Determine contrast columns + # ----- Check for "stratum" column in spatial definitions configuration + stratum_column = file_configuration["spatial_column"] + # ---- Append to other defined keys + contrast_columns = stratum_column + ["haul_num", "species_id", "length"] + # Meld the biological datasets length_datasets = specimen_data.meld(length_data, contrasts=contrast_columns) @@ -167,7 +163,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) - .apply(lambda x: average_sigma_bs(x, 
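# NOTE: Sketch of the running sigma_bs update explored in the scratch code above: TS values
# are converted to the linear domain and an existing mean is merged with a new batch using
# count-weighted averaging, which is equivalent to pooling all samples. Toy TS values only.
import numpy as np

def to_linear(ts_db):
    return 10.0 ** (ts_db / 10.0)

previous_mean, previous_count = to_linear(np.array([-61.0, -62.0, -63.0])).mean(), 3
new_samples = to_linear(np.array([-51.0, -52.0, -54.0, -56.0]))
updated_mean = np.average(
    [previous_mean, new_samples.mean()],
    weights=[previous_count, new_samples.size],
)
pooled = to_linear(np.array([-61.0, -62.0, -63.0, -51.0, -52.0, -54.0, -56.0])).mean()
assert np.isclose(updated_mean, pooled)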
weighted="length_count"), include_groups=False) + .apply(lambda x: average_sigma_bs(x, weights="length_count"), include_groups=False) .reset_index(name="sigma_bs") ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index adb67cc3..e8589b93 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -13,8 +13,21 @@ ) from .sql_methods import query_processed_files -from .live_acoustics import preprocess_acoustic_data, compute_nasc -from .live_biology import preprocess_biology_data +from .live_acoustics import ( + compute_nasc, + preprocess_acoustic_data +) + +from .live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_weight_regression, + number_proportions, + length_bin_weights, + preprocess_biology_data +) from . import live_data_processing as eldp @@ -116,19 +129,74 @@ def load_biology_data(self, ) def process_biology_data(self): - # method here - pass - - def process_acoustic_data(self, echometrics: bool = True): - - # Get the unprocessed acoustic data - acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + # TODO: How and when should the already processed data be imported? + # Separate out processed and unprocessed biological data + # ----- Unprocessed + biology_unprocessed = self.input["biology"] + + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], + biology_unprocessed["length_df"], + self.config) + + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) + + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) - # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = nasc_data_df + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + self.config) + ) + + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, + length_binned, self.config) + ) + + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + fitted_weight_df = compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): + + # Check for if any data is present; if not, provide report + if self.input["acoustics"]["prc_nasc_df"] is None: + # ---- Set the corresponding `nasc_df` DataFrame to None + self.input["nasc_df"] = None + # ---- Print, if verbose + if verbose: + print( + "No acoustic data located in `*.input['acoustics']['prc_nasc_df']" + " DataFrame. 
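# NOTE: Minimal sketch of the length-weight fit used by `length_weight_regression` above:
# regress log10(weight) on log10(length) with `np.polyfit`, yielding the allometric "rate"
# (slope) and "initial" (intercept) parameters. Lengths/weights below are toy values.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "length": [20.0, 25.0, 30.0, 35.0, 40.0],       # cm
    "weight": [0.055, 0.110, 0.195, 0.320, 0.480],  # kg
})
rate, initial = np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1)
# Back-transform to predict fitted weight-at-length: w = 10**initial * L**rate
weight_fitted = 10.0 ** initial * df["length"] ** rate
print(rate, initial)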
Data processing step will therefore be skipped." + ) + else: + # Get the unprocessed acoustic data + acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] + + # Integrate NASC (and compute the echometrics, if necessary) + nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) + + # Format the dataframe and insert into the LiveSurvey object + self.input["nasc_df"] = nasc_data_df def estimate_population(self): # method here diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 9678736b..ee87216b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,8 +1,17 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import reset_db_files +from echopop.live.sql_methods import query_processed_files +from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc +from echopop.live.live_biology import preprocess_biology_data +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, +) -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl + +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) @@ -34,4 +43,5 @@ #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates # TODO: Add argument that indicates what the new datasets and what data need to be pulled in +# TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} realtime_survey.estimate_population() \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2e7ef567..1d53749a 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -50,12 +50,6 @@ def process_biology_data(self): - # Separate out processed and unprocessed biological data - # ----- Unprocessed - biology_unprocessed = self.input["biology"] - # ---- Processed - biology_processed = self.input["biology_processed"] - # Compute `sigma_bs` by sending it to the appropriate database table compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], self.config) @@ -98,15 +92,23 @@ def process_biology_data(self): catch_data = self.input["biology"]["catch_df"] # Get the spatial column name, if there is one -contrast_columns = file_configuration["spatial_column"].copy() +spatial_column = file_configuration["spatial_column"] # ---- Append additional columns that will be used -contrast_columns.extend(["sex", "species_id"]) +contrast_columns = spatial_column + ["sex", "species_id"] # Calculate grouped totals +# ---- Sum the net haul weights from station 1/unaged fish +catch_weights = catch_data.count_variable( + contrasts=["species_id"] + spatial_column, + variable="haul_weight", fun="sum" +) +# ---- Rename resulting columns for both +catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + # ---- Specimen specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - +specimen_weight_binned # Calculate 
the sexed and total stratum weights for each sex among unaged fish # ---- Sum the net haul weights from station 1/unaged fish catch_weights = catch_data.count_variable( @@ -116,30 +118,191 @@ def process_biology_data(self): # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) -# Sum the sexed and total weights from the weight-fitted unaged data -# ---- Extract the unaged/length quantized weights -unaged_weights_binned = distributions_dict["unaged_length_weight_tbl"].copy() -# ---- Calculate the total weight per stratum per sex -unaged_weights_sex = unaged_weights_binned.sum() -# ---- Length (by sex) -length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum()#.to_frame("weight") -# ---- Further reduce -length_weight_total = length_weights_sex.transpose().unstack(0).sum(axis=0) +# For the specimen data +# ---- Sum the net haul weights from station 1/unaged fish +# ---- Specimen +specimen_weights_sex = ( + specimen_weight_binned + .groupby(contrast_columns)["weight"] + .sum() +) +# ---- Total (per stratum, if it exists) +specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + +# For the length (unaged) dataset +length_weights_sex = ( + length_weight_binned + .groupby(contrast_columns)["weight_interp"] + .sum() +) +# ---- Further reduce to the grand total (per stratum, if it exists) +length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + # ---- Standardize the unaged sexed weights -(length_weights_sex / length_weight_total).unstack(0) * catch_weights["total_weight"].to_numpy() +length_weight_standardized = ( + (length_weights_sex / length_weight_total).unstack(0) + * catch_weights["total_weight"].to_numpy() +) + +# Calculate the specimen weight proportions +# ---- Pivot weight bins +specimen_weight_binned_pvt = ( + specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed = False + ) +) +# ---- Divide by the aged stratum weights (relative to only aged fish) +specimen_weight_proportions_pvt = ( + specimen_weight_binned_pvt / specimen_weight_total.to_numpy() +) +# ---- Pivot back to the desired format +specimen_weight_proportion = ( + specimen_weight_proportions_pvt + .stack().reset_index(name="weight_proportion") + .pivot_table(columns=stratum_column + ["species_id", "sex"], + index="length_bin", values="weight_proportion") +) +# ---- Calculate the internal (i.e. 
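# NOTE: Simplified sketch of the proportion logic assembled above: sexed weights are summed
# per stratum, divided by the stratum total to give weight proportions, and the unaged
# fraction is taken as the complement of the aged fraction. Numbers are illustrative only.
import pandas as pd

weights = pd.DataFrame({
    "stratum": [1, 1, 2, 2],
    "sex": ["male", "female", "male", "female"],
    "weight": [120.0, 150.0, 80.0, 95.0],
})
weights["weight_proportion"] = (
    weights["weight"] / weights.groupby("stratum")["weight"].transform("sum")
)
aged_proportions = pd.Series({1: 0.6, 2: 0.7})  # e.g. aged (specimen) share per stratum
unaged_proportions = 1.0 - aged_proportions
print(weights, unaged_proportions, sep="\n")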
only aged fish) for each sex +within_specimen_sex_proportions = ( + specimen_weight_proportion.sum() +) +# Calculate the total strata weights +# ---- Index `catch_weights` +catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) +# ---- Compute the spatially-stratified/grouped weights +spatial_weights = ( + pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) + .pivot_table( + columns=stratum_column, + aggfunc="sum", + values="total_weight", + observed=False + ) +) -length_weight_total = ( - length_weights_sex.reset_index(list(set(contrast_columns)-set(file_configuration["spatial_column"].copy()))) - ["weight_interp"].sum() +# Calculate the weight proportions relative to the overall stratum weights +# ---- Aged +# -------- Reformat into dataframe and merge with total stratum weights +specimen_weights_binned_df = ( + specimen_weight_binned_pvt.stack() + .to_frame("specimen_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=stratum_column) +) +# -------- Calculate proportions +specimen_weights_binned_df["weight_proportion_overall"] = ( + specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] +) +# -------- Consolidate to calculate the sexed proportions per stratum +specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column + ["species_id", "sex"])[ + "weight_proportion_overall" +].sum() +# ---- Unaged +# -------- Reformat into dataframe and merge with total stratum weights +length_weights_sex_standardized_df = ( + length_weight_standardized.stack() + .to_frame("catch_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=stratum_column) +) +# -------- Calculate proportions +length_weights_sex_standardized_df["weight_proportion_overall"] = ( + length_weights_sex_standardized_df["catch_weight"] + / length_weights_sex_standardized_df["total_weight"] +) +# -------- Back-calculate the sexed weight proportions relative to just unaged fish +# ------------ Aggregate proportions +length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" +).transpose().unstack(["species_id"]).sum(axis=0) +# ------------ Re-compute the proportions +length_weight_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + index=["species_id", "sex"], columns=stratum_column, + values="weight_proportion_overall" + ) + / length_total_sex_proportions.to_numpy() ) -# ---- Calculate the stratum totals -unaged_strata_weights = unaged_weights_sex.unstack(0).sum(axis=0) -# ---- Standardize the unaged sexed weights -unaged_weights_sex_standardized = (unaged_weights_sex / unaged_strata_weights).unstack( - 0 -) * catch_strata_weights["stratum_weight"].to_numpy() +# Compute the overall length-binned weight distributions among unaged fish +# ---- Extract the number proportions computed for unaged fish +length_number_proportions = length_number_proportion.copy() +# ---- Filter out values besides those computed for 'all' fish +length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] +# ---- Convert to a table +length_number_proportions_tbl = length_number_proportions.pivot_table( + columns=stratum_column + ["species_id"], + index=["length_bin"], + values="proportion_number_length", + aggfunc="sum", + observed=False, +) +# ---- Extract the fitted weight values calculated for all fish +length_weight_all = 
length_weight_df[length_weight_df["sex"] == "all"] +# ---- Generate the fitted weight array +fitted_weights = length_weight_all.copy() +# ---- Get actual length bins in dataset +fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] +# ---- Apportion the averaged weights +length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() +# ---- Compute the average weight proportions per length bin per stratum +average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) +# ---- Convert back to a DataFrame +average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( + name="weight_proportion" +) + +# Calculate the aged and unaged weight proportions +# ---- Aged +aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) +# ---- Unaged +unaged_proportions = 1 - aged_proportions +# -------- Re-weight the unaged sexed proportions +unaged_weight_sex_proportions_overall = ( + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) +) + +unaged_proportions.unstack().transpose() +# Format the outputs +# ---- Aged: stratum-sex-age-length relative to aged and total weights +aged_overall_df = ( + specimen_weight_proportion.unstack() + .reset_index(name="weight_proportions") + .merge( + specimen_weights_binned_df[ + stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] + ] + ) +) +# ---- Aged: stratum-sex relative to total weights +aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( + stratum_column + ["species_id", "sex"] + ) +# ---- Add the aged sex proportiosn relative to the overall survey +aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions +# ---- Consolidate the aged and unaged sexed dataframes +# -------- Initialize the dataframe +aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + stratum_column) +# --------- Add the within-unaged weight proportions +aged_unaged_sex_proportions["weight_proportion_unaged"] = ( + length_weight_sex_proportions.stack() +) +# --------- Add the overall-unaged weight proportions +aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( + unaged_weight_sex_proportions_overall.stack() +) +# ---- Overall aged and unaged proportions +aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") +# ---- Set index +aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) +# -------- Add unaged proportions +aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() +# ---- Reset the index +aged_unaged_proportions = aged_unaged_proportions.reset_index() #################################################################################################### # * Functionality for reading in processed acoustic data # TODO: Expand data validator and limit cases to '*.zarr' (for now) From c95cf8dcd3d1f666c0c9d4cbe08804fb43aa62ce Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 14:07:44 -0700 Subject: [PATCH 13/81] Further refinement of `process_biology_data` meth --- echopop/biology.py | 2 +- echopop/live/live_biology.py | 228 ++++++++++++++++++++++++++++++- echopop/test_workflow.py | 15 +- echopop/utils/operations.py | 4 +- echopop/zarr_read_ingest_test.py | 2 +- 5 files changed, 241 insertions(+), 10 deletions(-) diff --git 
a/echopop/biology.py b/echopop/biology.py index 0d24ef6b..cf8f0faa 100644 --- a/echopop/biology.py +++ b/echopop/biology.py @@ -75,7 +75,7 @@ def fit_length_weight_relationship( np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), index=["rate", "initial"], ), - include_groups=False, + # include_groups=False, ) .reset_index() ) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index efc88765..f42dbe82 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -163,7 +163,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) - .apply(lambda x: average_sigma_bs(x, weights="length_count"), include_groups=False) + .apply(lambda x: average_sigma_bs(x, weights="length_count")) .reset_index(name="sigma_bs") ) @@ -244,7 +244,7 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da np.polyfit(np.log10(df["length"]), np.log10(df["weight"]), 1), index=["rate", "initial"], ), - include_groups=False, + # include_groups=False, ) .reset_index() ) @@ -807,4 +807,226 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) # Return output - return fitted_weight_df \ No newline at end of file + return fitted_weight_df + +def weight_proportions(catch_data: pd.DataFrame, + specimen_data: pd.DataFrame, + length_data: pd.DataFrame, + specimen_weight_binned: pd.DataFrame, + length_weight_binned: pd.DataFrame, + length_number_proportion: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict): + + # Get the spatial column name, if there is one + spatial_column = file_configuration["spatial_column"] + # ---- Append additional columns that will be used + contrast_columns = spatial_column + ["sex", "species_id"] + + # Calculate grouped totals + # ---- Sum the net haul weights from station 1/unaged fish + catch_weights = catch_data.count_variable( + contrasts=["species_id"] + spatial_column, + variable="haul_weight", fun="sum" + ) + # ---- Rename resulting columns for both + catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + + # Sum total weights for specimen data + specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + + # For the specimen data + # ---- Sum the net haul weights from station 1/unaged fish + specimen_weights_sex = ( + specimen_weight_binned + .groupby(contrast_columns)["weight"] + .sum() + ) + # ---- Total (per stratum, if it exists) + specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + + # For the length (unaged) dataset + length_weights_sex = ( + length_weight_binned + .groupby(contrast_columns)["weight_interp"] + .sum() + ) + # ---- Further reduce to the grand total (per stratum, if it exists) + length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + + # ---- Standardize the unaged sexed weights + length_weight_standardized = ( + (length_weights_sex / length_weight_total).unstack(0) + * catch_weights["total_weight"].to_numpy() + ) + + # Calculate the specimen weight proportions + # ---- Pivot weight bins + specimen_weight_binned_pvt = ( + specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed = False + ) + ) + # ---- Divide by the aged stratum weights (relative to only aged fish) + specimen_weight_proportions_pvt = ( + specimen_weight_binned_pvt / 
specimen_weight_total.to_numpy() + ) + # ---- Pivot back to the desired format + specimen_weight_proportion = ( + specimen_weight_proportions_pvt + .stack().reset_index(name="weight_proportion") + .pivot_table(columns=spatial_column + ["species_id", "sex"], + index="length_bin", values="weight_proportion") + ) + # ---- Calculate the internal (i.e. only aged fish) for each sex + within_specimen_sex_proportions = ( + specimen_weight_proportion.sum() + ) + + # Calculate the total strata weights + # ---- Index `catch_weights` + catch_weights_idx = catch_weights.set_index(spatial_column + ["species_id"]) + # ---- Compute the spatially-stratified/grouped weights + spatial_weights = ( + pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) + .pivot_table( + columns=spatial_column, + aggfunc="sum", + values="total_weight", + observed=False + ) + ) + + # Calculate the weight proportions relative to the overall stratum weights + # ---- Aged + # -------- Reformat into dataframe and merge with total stratum weights + specimen_weights_binned_df = ( + specimen_weight_binned_pvt.stack() + .to_frame("specimen_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=spatial_column) + ) + # -------- Calculate proportions + specimen_weights_binned_df["weight_proportion_overall"] = ( + specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] + ) + # -------- Consolidate to calculate the sexed proportions per stratum + specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(spatial_column + ["species_id", "sex"])[ + "weight_proportion_overall" + ].sum() + # ---- Unaged + # -------- Reformat into dataframe and merge with total stratum weights + length_weights_sex_standardized_df = ( + length_weight_standardized.stack() + .to_frame("catch_weight") + .reset_index() + .merge(spatial_weights.T.reset_index(), on=spatial_column) + ) + # -------- Calculate proportions + length_weights_sex_standardized_df["weight_proportion_overall"] = ( + length_weights_sex_standardized_df["catch_weight"] + / length_weights_sex_standardized_df["total_weight"] + ) + # -------- Back-calculate the sexed weight proportions relative to just unaged fish + # ------------ Aggregate proportions + length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" + ).transpose().unstack(["species_id"]).sum(axis=0) + # ------------ Re-compute the proportions + length_weight_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + index=["species_id", "sex"], columns=spatial_column, + values="weight_proportion_overall" + ) + / length_total_sex_proportions.to_numpy() + ) + + # Compute the overall length-binned weight distributions among unaged fish + # ---- Extract the number proportions computed for unaged fish + length_number_proportions = length_number_proportion.copy() + # ---- Filter out values besides those computed for 'all' fish + length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] + # ---- Convert to a table + length_number_proportions_tbl = length_number_proportions.pivot_table( + columns=spatial_column + ["species_id"], + index=["length_bin"], + values="proportion_number_length", + aggfunc="sum", + observed=False, + ) + # ---- Extract the fitted weight values calculated for all fish + length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] + # ---- Generate the fitted weight 
array + fitted_weights = length_weight_all.copy() + # ---- Get actual length bins in dataset + fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] + # ---- Apportion the averaged weights + length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + # ---- Compute the average weight proportions per length bin per stratum + average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) + # ---- Convert back to a DataFrame + average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( + name="weight_proportion" + ) + + # Calculate the aged and unaged weight proportions + # ---- Aged + aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) + # ---- Unaged + unaged_proportions = 1 - aged_proportions + # -------- Re-weight the unaged sexed proportions + unaged_weight_sex_proportions_overall = ( + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) + ) + + unaged_proportions.unstack().transpose() + # Format the outputs + # ---- Aged: stratum-sex-age-length relative to aged and total weights + aged_overall_df = ( + specimen_weight_proportion.unstack() + .reset_index(name="weight_proportions") + .merge( + specimen_weights_binned_df[ + spatial_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] + ] + ) + ) + # ---- Aged: stratum-sex relative to total weights + aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( + spatial_column + ["species_id", "sex"] + ) + # ---- Add the aged sex proportiosn relative to the overall survey + aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions + # ---- Consolidate the aged and unaged sexed dataframes + # -------- Initialize the dataframe + aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + spatial_column) + # --------- Add the within-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_unaged"] = ( + length_weight_sex_proportions.stack() + ) + # --------- Add the overall-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( + unaged_weight_sex_proportions_overall.stack() + ) + # ---- Overall aged and unaged proportions + aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") + # ---- Set index + aged_unaged_proportions.set_index(spatial_column + ["species_id"], inplace=True) + # -------- Add unaged proportions + aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() + # ---- Reset the index + aged_unaged_proportions = aged_unaged_proportions.reset_index() + + # Return output + return { + "aged_weight_proportions_df": aged_overall_df, + "unaged_weight_proportions_df": average_length_bin_weights_df, + "aged_unaged_sex_weight_proportions_df": ( + aged_unaged_sex_proportions.astype(float).reset_index().fillna(0.0) + ), + "aged_unaged_weight_proportions_df": aged_unaged_proportions, + } + diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index ee87216b..dec45397 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -6,12 +6,21 @@ from echopop.live.live_core import( LIVE_DATA_STRUCTURE, ) - +from echopop.live.live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_weight_regression, + number_proportions, 
+ length_bin_weights, + preprocess_biology_data +) from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index eae68ec2..5db0e84c 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -178,7 +178,7 @@ def meld(specimen_dataframe: pd.DataFrame, length_dataframe: pd.DataFrame, contr specimen_stacked = ( specimen_dataframe.copy() .groupby(contrasts, observed=False)[["length"]] - .apply(lambda x: len(x), include_groups=True) + .apply(lambda x: len(x)) .reset_index(name="length_count") ) @@ -329,7 +329,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( grouped_data.groupby(contrast).apply( - lambda group: interpolator_factory(group), include_groups=False + lambda group: interpolator_factory(group) ) ).to_dict() diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 1d53749a..03c47590 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -553,7 +553,7 @@ def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: di # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = ( acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics), include_groups=False) + .apply(lambda group: integrate_nasc(group, echometrics)) .reset_index() ) # ---- Amend the dtypes if echometrics were computed From 4e4ca876087e3c6a164aaf3ab3a2ab372d16eb7b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 1 Aug 2024 22:34:31 -0700 Subject: [PATCH 14/81] Complete biology processing code --- echopop/live/live_biology.py | 11 +--- echopop/live/live_survey.py | 21 +++++-- echopop/live/sql_methods.py | 2 +- echopop/test_workflow.py | 9 ++- echopop/zarr_read_ingest_test.py | 103 ++++++++++++++++++++++++++++++- 5 files changed, 128 insertions(+), 18 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index f42dbe82..896b3b23 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names +from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs from ..acoustics import ts_length_regression, to_dB, to_linear @@ -198,9 +198,9 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da file_configuration: dict): # Get the spatial column name, if there is one - contrast_columns = file_configuration["spatial_column"].copy() + spatial_column = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used - contrast_columns.extend(["trawl_partition", "sex", "haul_num", 
"species_id", "length_bin"]) + contrast_columns = spatial_column + ["trawl_partition", "sex", "haul_num", "species_id", "length_bin"] # Gather specimen measurements to represent 'all' fish specimen_data_all = specimen_data.assign(sex="all") @@ -810,8 +810,6 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, return fitted_weight_df def weight_proportions(catch_data: pd.DataFrame, - specimen_data: pd.DataFrame, - length_data: pd.DataFrame, specimen_weight_binned: pd.DataFrame, length_weight_binned: pd.DataFrame, length_number_proportion: pd.DataFrame, @@ -832,9 +830,6 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - # Sum total weights for specimen data - specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - # For the specimen data # ---- Sum the net haul weights from station 1/unaged fish specimen_weights_sex = ( diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index e8589b93..ac5bfcc3 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -15,6 +15,7 @@ from .sql_methods import query_processed_files from .live_acoustics import ( compute_nasc, + format_acoustic_dataset, preprocess_acoustic_data ) @@ -23,10 +24,11 @@ compute_average_weights, compute_sigma_bs, length_bin_counts, - length_weight_regression, - number_proportions, length_bin_weights, - preprocess_biology_data + length_weight_regression, + number_proportions, + preprocess_biology_data, + weight_proportions ) @@ -175,6 +177,17 @@ def process_biology_data(self): length_weight_df, self.config["length_distribution"], self.config) + + # Compute the weight proportions + self.input["biology"].update({ + "proportions": weight_proportions(biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config) + }) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -196,7 +209,7 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = nasc_data_df + self.input["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) def estimate_population(self): # method here diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0bb47306..db5e3a06 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -589,7 +589,7 @@ def SQL(db_file: str, command: str, **kwargs): engine = create_engine(f"sqlite:///{db_file}") # Format the data columns, if necessary, to fit within the SQL commands - if command not in ["inspect", "update", "select"]: + if command not in ["inspect", "update"]: kwargs = format_sql_columns(kwargs) # Run the command diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index dec45397..872849fd 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -3,6 +3,8 @@ from echopop.live.sql_methods import query_processed_files from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc from echopop.live.live_biology import preprocess_biology_data +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange + from echopop.live.live_core import( LIVE_DATA_STRUCTURE, ) @@ -14,13 +16,14 
@@ length_weight_regression, number_proportions, length_bin_weights, - preprocess_biology_data + preprocess_biology_data, + weight_proportions ) from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 03c47590..2cfd0cd8 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -15,11 +15,11 @@ from sqlalchemy import create_engine, text, Engine, inspect from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import preprocess_acoustic_data +from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc, format_acoustic_dataset from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -89,6 +89,105 @@ def process_biology_data(self): self.config["length_distribution"], self.config) +# NOTE: ARGUMENT: {working_dataset: Literal["acoustic", "biology"]} +working_dataset = "biology" + +# +acoustic_db = self.config["database"]["acoustics"] +biology_db = self.config["database"]["biology"] + +# +spatial_column = file_configuration["spatial_column"] + +# Create conditional string +condition_str = ( + f"stratum in {np.unique(self.input["nasc_df"]["stratum"])} " + f"& nasc > 0.0" +) + +# Get corresponding data +acoustic_df = SQL(acoustic_db,"select",table_name="survey_data_df", + condition=condition_str) + +# Get corresponding `sigma_bs` +sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", + condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +# ---- Compute the weighted average +sigma_bs_mean_df = ( + sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] + .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) + .to_frame("sigma_bs_mean") + .reset_index() +) + +# +nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) + +# +nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs"]) +) + +psi = 10 ** (-21/10) +psi * 280**2 * 1500 * 128e-6 / 2 +psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] + +psi * (280.0 ** 2) / 1852 ** 2 +depth_area = 280 ** 2 * psi +swath_length = 0.5 * 1852 +depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] +280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] + +SQL(acoustic_db, "map") +beam_angle = 9.0 * np.pi / 
180.0 +280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] +280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * nasc_biology["number_density"] +area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) +area / 1852 ** 2 * nasc_biology["number_density"] +SQL(acoustic_db, "map") + +# Merge hake fraction data into `nasc_interval_df` +# ---- Initial merge +nasc_interval_df = nasc_interval_df.merge( + input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" +) +# ---- Replace `fraction_hake` where NaN occurs +nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) +# ---- Drop NaN +nasc_interval_df.dropna(subset=["transect_num"], inplace=True) + +# Calculate the along-transect number density (animals per nmi^2) +# ---- Merge NASC measurements with mean sigma_bs for each stratum +nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) +# ---- Calculate the number densities +nasc_biology["number_density"] = ( + nasc_biology["fraction_hake"] + * nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +) + + +if working_dataset == "acoustic": + db_file = self.config["database"]["acoustic"] +elif working_dataset == "biology": + db_file = self.config["database"]["biology"] +else: + raise ValueError( + f"Argument for `working_dataset` [{working_dataset}] is invalid." + f" Value must either be 'acoustic' or 'biology'." + ) + +# Extract the necessary correct strata mean sigma_bs +sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] + +# Pull out the length-weight conversion for each stratum +length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] + +# Get the name of the stratum column +stratum_col = settings_dict["transect"]["stratum_name"] + + catch_data = self.input["biology"]["catch_df"] # Get the spatial column name, if there is one From d0f4208fc3d2a8ddc0ba0ab00d1a9344e9c672de Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Sun, 4 Aug 2024 19:10:54 -0700 Subject: [PATCH 15/81] More changes to methods --- echopop/live/live_acoustics.py | 40 +++- echopop/live/live_data_processing.py | 53 +++++ echopop/live/live_survey.py | 24 +- echopop/live/sql_methods.py | 1 + echopop/zarr_read_ingest_test.py | 334 ++++++++++++++++++++++----- 5 files changed, 379 insertions(+), 73 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 11d0b392..5bd29aca 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,10 +1,11 @@ -from typing import Union, Optional +from typing import Union, Optional, List import numpy as np import pandas as pd from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions -from .sql_methods import sql_data_exchange +from .sql_methods import sql_data_exchange, SQL +from .live_data_processing import get_unique_identifiers, query_dataset # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -243,4 +244,37 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Return the formatted dataframe return df - + +def get_nasc_sql_data(db_file: str, + data_dict: dict, + unique_columns: List[str]): + # ---- Add SELECTION columns + data_columns = ( + unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + "biomass_density"] + ) + # ----- Get the SQL dataset + nasc_sql_data = 
query_dataset(db_file,
+                                  data_dict,
+                                  table_name="survey_data_df",
+                                  data_columns = data_columns,
+                                  unique_columns=unique_columns,
+                                  constraint="nasc > 0.0")
+    # ---- Use SQL table data if present
+    if nasc_sql_data is not None and not nasc_sql_data.empty:
+        return nasc_sql_data
+    elif "nasc_df" in data_dict.keys():
+        return data_dict["nasc_df"]
+
+def get_sigma_bs_sql_data(db_file: str,
+                          data_dict: dict,
+                          unique_columns: list):
+
+    # Get corresponding `sigma_bs` DataFrame
+    sigma_bs_df = query_dataset(db_file,
+                                data_dict,
+                                table_name="sigma_bs_mean_df",
+                                data_columns=["sigma_bs", "sigma_bs_count"],
+                                unique_columns=unique_columns)
+
+    sigma_bs_df = SQL(db_file, "select", table_name="sigma_bs_mean_df")
diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py
index cf126230..9587c935 100644
--- a/echopop/live/live_data_processing.py
+++ b/echopop/live/live_data_processing.py
@@ -1,6 +1,8 @@
 import yaml
 import re
+from functools import reduce
+from .sql_methods import SQL
 from pathlib import Path
 from typing import Union, Tuple, Optional, List
@@ -12,3 +14,54 @@
     LIVE_FILE_FORMAT_MAP,
     LIVE_INPUT_FILE_CONFIG_MAP
 )
+
+def get_unique_identifiers(data_dict: dict,
+                           unique_columns: List[str]) -> pd.DataFrame:
+
+    # Gather all dataframes from a dictionary into a list
+    df_list = [df for _, df in data_dict.items()]
+
+    # Get unique values of each contrast column across the biological datasets
+    dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns
+           for df in df_list if not df.empty and isinstance(df, pd.DataFrame)]
+
+    # Reduce into a single DataFrame
+    if len(unique_columns) > 1:
+        return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs)
+    else:
+        return reduce(lambda left, right: pd.merge(left, right, how='inner'), dfs)
+
+
+def query_dataset(db_file: str,
+                  data_dict: dict,
+                  table_name: str,
+                  data_columns: List[str],
+                  unique_columns: List[str],
+                  constraint: str = None):
+
+    # Validate that the desired table exists
+    if SQL(db_file, "validate", table_name=table_name):
+        # ---- Inspect the SQL table
+        inspected_table = SQL(db_file, "inspect", table_name=table_name)
+        # ---- Create a list of intersecting column names
+        unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns)))
+        # ---- Create list of valid columns
+        valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns)))
+        # ---- Get unique identifiers
+        unique_keys_df = get_unique_identifiers(data_dict, unique_keys)
+        # ---- Create conditional string
+        conditional_str = (
+            " & ".join([f"{col} in {np.unique(unique_keys_df[col])}"
+                        for col in unique_keys_df.columns])
+        )
+        # ---- Append the additional constraint statement if present
+        if constraint is not None:
+            conditional_str += f" & {constraint}"
+        # ---- SELECT the dataset using the conditional statement
+        data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys,
+                       condition=conditional_str).filter(data_columns)
+    else:
+        data_sql = None
+
+    # Return the table DataFrame
+    return data_sql
diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py
index ac5bfcc3..6f0b568e 100644
--- a/echopop/live/live_survey.py
+++ b/echopop/live/live_survey.py
@@ -1,4 +1,4 @@
-from typing import Union, Optional
+from typing import Union, Optional, Literal
 from pathlib import Path
 import copy
@@ -171,12 +171,14 @@ def process_biology_data(self):
         )
         # Calculate the average weights among male, female, and all fish
-        fitted_weight_df =
compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) + self.input["weight_stratumn_df"] = ( + compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + ) # Compute the weight proportions self.input["biology"].update({ @@ -209,9 +211,11 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) - def estimate_population(self): - # method here + def estimate_population(self, + working_dataset: Literal["acoustic", "biology"]): + + # method pass diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index db5e3a06..3cdca5fd 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -333,6 +333,7 @@ def sql_select(connection: sqla.Connection, table_name: str, "float": "FLOAT", "int": "INTEGER", 'bool': 'BOOLEAN', + "Interval": "TEXT", "Timestamp": "DATETIME", 'object': 'TEXT', "str": "TEXT", diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2cfd0cd8..09e93fcc 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -48,70 +48,34 @@ database_file = biology_db kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) -def process_biology_data(self): - - # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], - self.config) - - # Bin the length measurements of the biological data - bin_length_data(biology_unprocessed, self.config["length_distribution"]) - - # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - - # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], - self.config) - ) - - # Compute the number proportions - specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, length_binned, - self.config) - ) - - # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) - ) - - # Calculate the average weights among male, female, and all fish - fitted_weight_df = compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) - -# NOTE: ARGUMENT: {working_dataset: Literal["acoustic", "biology"]} -working_dataset = "biology" - -# -acoustic_db = self.config["database"]["acoustics"] -biology_db = self.config["database"]["biology"] +# NOTE: ARGUMENT: 
{working_dataset: Literal["acoustics", "biology"]} +working_dataset = "acoustics" +self = realtime_survey +file_configuration = self.config +self.results["biology"] = self.input["biology_processed"] +self.results["acoustics"] = self.input["nasc_df"] -# +# Get spatial column spatial_column = file_configuration["spatial_column"] -# Create conditional string -condition_str = ( - f"stratum in {np.unique(self.input["nasc_df"]["stratum"])} " - f"& nasc > 0.0" -) +# Initialize the working data dictionary +working_data = copy.deepcopy(self.results) +contrast_columns = [] +# ---- Define unique columns +unique_columns = spatial_column + contrast_columns + +if working_dataset == "acoustics" and self.input["nasc_df"] is not None: + # ---- Get dataset + acoustic_df = get_nasc_sql_data(acoustic_db, + self.input["acoustics"], + unique_columns=unique_columns) -# Get corresponding data -acoustic_df = SQL(acoustic_db,"select",table_name="survey_data_df", - condition=condition_str) # Get corresponding `sigma_bs` -sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", - condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +# sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", +# condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") +sigma_bs_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") +sigma_bs_df["stratum"] = 2 # ---- Compute the weighted average sigma_bs_mean_df = ( sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] @@ -121,13 +85,263 @@ def process_biology_data(self): ) # -nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) +nasc_biology = acoustic_df.merge(sigma_bs_mean_df, on=spatial_column) + +# Get the spatially averaged weights +weight_spatial_averages = self.input["weight_stratumn_df"] +# ---- Sub-select 'all' +general_weight_averages = weight_spatial_averages[weight_spatial_averages["sex"] == "all"] +general_weight_averages["stratum"] = 2 # nasc_biology["number_density"] = ( nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs"]) + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +) + +# +nasc_biology = nasc_biology.merge(general_weight_averages) + +nasc_biology["biomass_density"] = nasc_biology["number_density"] * nasc_biology["average_weight"] + +sql_group_update(acoustic_db, dataframe=nasc_biology, + table_name="survey_data_df", columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + +strata_df = self.input["spatial"]["strata"].copy() +strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", + "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan +strata_df.drop(columns=["latitude_interval"], inplace=True) +SQL(acoustic_db, "select", table_name="survey_data_df") + +SQL(biology_db, "drop", table_name="strata_summary_df") +SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, primary_keys=["stratum"]) +SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, + id_columns=["stratum"]) + +tt = pd.DataFrame({ + "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), + "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), + "area": 50 ** 2, + "mean_number_density": 0.0, + "mean_biomass_density": 0.0, + "abundance": 0.0, + "biomass": 0.0 +}) + +nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) +nasc_biology_output_a.loc[3, "x"] = 2 +nasc_biology_output_a.loc[3, "y"] = 3 +nasc_biology_output_a = 
nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", "latitude", "nasc", "number_density", "biomass_density"]) +nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) +nasc_biology_output["number_density"] = ( + nasc_biology_output["nasc"] + / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) ) +nasc_biology_output =nasc_biology_output.merge(general_weight_averages) +nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] * nasc_biology_output["average_weight"] +nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude", "number_density", "biomass_density"]) +nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0].reset_index() + +SQL(acoustic_db, "drop", table_name="reference") +SQL(acoustic_db, "drop", table_name="grid") + +SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) +SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) + +SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) +SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) + +SQL(acoustic_db, "select", table_name="grid") +SQL(acoustic_db, "select", table_name="reference") + +sql_group_update(acoustic_db, dataframe=nasc_biology_output, + table_name="grid", columns=["number_density", "biomass_density"], + unique_columns=["stratum", "x", "y", "longitude", "latitude"]) + +SQL(acoustic_db, "select", table_name="grid") + +from typing import List + +data_table = "grid" +grid_table = "reference" +column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] +coordinates = ["x", "y"] +dataframe = nasc_biology_output + +def update_population_grid(db_file: str, + data_table: str, + grid_table: str, + dataframe: pd.DataFrame, + column_pairs: Union[List[tuple[str, str]], tuple[str, str]], + coordinates: List[str]): + + # Convert `column_pairs` to a list, if needed + if not isinstance(column_pairs, list): + column_pairs = [column_pairs] + + dataframe[coordinates] + # Format the coordinate pairs + # ---- Convert coordinate values into a list of tuples + coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] + # ---- Get unique pairs + coords = list(set(coord_pairs)) + + # Format the SQL script command + # ---- Initialize + sql_script = [] + # ---- Iteratively update + for input_column, output_column in column_pairs: + sql_script.append( + f""" + BEGIN TRANSACTION; + + -- Calculate averages for input_column and update grid_table + WITH avgs AS ( + SELECT + {coordinates[0]}, + {coordinates[1]}, + AVG(d.{input_column}) as avg_value + FROM {data_table} d + GROUP BY d.{coordinates[0]}, d.{coordinates[1]} + ) + + -- Update the grid_table with both average and computed total + UPDATE {grid_table} + SET + mean_{input_column} = ( + SELECT avg_value + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ), + {output_column} = ( + SELECT avg_value * {grid_table}.area + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ) + WHERE EXISTS ( + SELECT 1 + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ); + + COMMIT; + """ + ) + + # Create the engine + engine = create_engine(f"sqlite:///{db_file}") + + # Create the SQL database connection and 
send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript("\n".join(sql_script)) + + +SQL(acoustic_db, "select", table_name=data_table) +SQL(acoustic_db, "select", table_name=grid_table) + + +SQL(acoustic_db, "update", table_name="grid", dataframe=nasc_biology_output, unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) +SQL(acoustic_db, "select", table_name="reference") + +source_db = acoustic_db +target_db = biology_db + +source_table = "grid" +target_table = "strata_summary_df" + +data_columns = ["number_density", "biomass_density"] +strata_columns = ["stratum"] +strata = [2] +stratum_list = ', '.join(map(str, stratum_values)) + +data_column = data_columns[0] +data_columns = data_columns[0] +def sql_update_strata_summary(source_db: str, + target_db: str, + arg_fun: str, + data_columns: List[tuple[str, str]], + strata: list): + + # Format strata list as a string + strata_str = ', '.join(map(str, strata)) + + # Function reference map + FUNCTION_MAP = { + "sum": {"function": "SUM", + "suffix": "sum"}, + "mean": {"function": "AVG", + "suffix": "mean"} + } + + # Prepare the SQL script + sql_script = f""" + -- Attach the source and target databases + ATTACH DATABASE '{source_db}' AS source; + ATTACH DATABASE '{target_db}' AS target; + + """ + + # Dynamically format the cross-database command + for data_column, method in data_columns: + # ----- Format the function-method-suffic keys + suffix = FUNCTION_MAP[method]["suffix"] + fun = FUNCTION_MAP[method]["function"] + # ---- Create the combined SQL command using f-strings + sql_script += f""" + -- Calculate averages and directly update the target table + UPDATE target.{target_table} + SET {data_column}_{suffix} = ( + SELECT {fun}({data_column}) + FROM source.{source_table} + WHERE stratum = target.{target_table}.stratum + ) + WHERE stratum IN ({strata_str}); + """ + # ----- Append DETACH commands only once at the end + sql_script += """ + -- Detach the databases + DETACH DATABASE source; + DETACH DATABASE target; + """ + + # Create the engine + engine = create_engine(f"sqlite:///{target_db}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript(sql_script) + +SQL(biology_db, "select", table_name=target_table) +SQL(acoustic_db, "select", table_name=source_table) +connection.close() +dbapi_conn.close() + + +pairs = [(1, 2), (3, 4), (5, 6)] + +# Convert the pairs into a format suitable for SQL IN clause +pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) + +# Construct the SQL command as a text string +sql_command = f''' +BEGIN TRANSACTION; + +UPDATE reference +SET total = ( + SELECT AVG(g.sigma_bs) * r.area + FROM grid g + WHERE g.stratum = r.stratum_x +) +WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); + +COMMIT; +''' psi = 10 ** (-21/10) psi * 280**2 * 1500 * 128e-6 / 2 From 6020f7946f06afcf66ae7ea0c7ce8f4925764864 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 7 Aug 2024 12:31:55 -0700 Subject: [PATCH 16/81] Full drafted workflow --- config_files/live_survey_year_2019_config.yml | 1 + echopop/live/live_acoustics.py | 78 +++---- echopop/live/live_biology.py | 127 ++++++++-- echopop/live/live_data_loading.py | 5 +- echopop/live/live_data_processing.py | 217 +++++++++++++++++- echopop/live/live_spatial_methods.py | 2 +- echopop/live/live_survey.py | 207 ++++++++++++----- echopop/live/sql_methods.py | 195 
+++++++++++++--- echopop/test_workflow.py | 66 +++--- echopop/zarr_read_ingest_test.py | 167 ++++++++++---- 10 files changed, 828 insertions(+), 237 deletions(-) diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index b7b7aef4..4111ea05 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -45,4 +45,5 @@ input_directories: coastline: directory: coastline/ coastline_name: ne_110m_land + ... diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 5bd29aca..82e4c1a3 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -4,8 +4,7 @@ from ..acoustics import ts_length_regression, to_linear, to_dB from .live_spatial_methods import apply_spatial_definitions -from .sql_methods import sql_data_exchange, SQL -from .live_data_processing import get_unique_identifiers, query_dataset +from .sql_methods import sql_data_exchange, SQL, query_processed_files # TODO: Documentation def configure_transmit_frequency(frequency_values: pd.Series, @@ -49,8 +48,9 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, ) # Apply spatial settings - prc_nasc_df_filtered.loc[:, "stratum"] = ( - apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) + prc_nasc_df_filtered = ( + prc_nasc_df_filtered + .assign(stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict)) ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object @@ -192,13 +192,25 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, spatial_column = file_configuration["spatial_column"] # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) - .apply(integrate_nasc, echometrics) - .unstack().reset_index() - .sort_values("ping_time") - ) + # ---- Get number of unique sources + if len(np.unique(acoustic_data_df["source"])) == 1: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics) + .reset_index() + .sort_values("ping_time") + ) + else: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics, include_groups=False) + .unstack().reset_index() + .sort_values("ping_time") + ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes @@ -219,7 +231,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # Return the output return nasc_data_df -def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict): +def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict, meta_dict: dict): # Get acoustic database filename acoustic_db = file_configuration["database"]["acoustics"] @@ -236,6 +248,12 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values + + # Update the successfully processed files + query_processed_files(file_configuration["data_root_dir"], + file_configuration["input_directories"]["acoustics"], + 
meta_dict["provenance"]["acoustic_files"], + processed=True) # Insert the new data into the database & pull in the combined dataset # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT @@ -243,38 +261,4 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) # Return the formatted dataframe - return df - -def get_nasc_sql_data(db_file: str, - data_dict: dict, - unique_columns: List[str]): - # ---- Add SELECTION columns - data_columns = ( - unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", - "biomass_density"] - ) - # ----- Get the SQL dataset - nasc_sql_data = query_dataset(db_file, - data_dict, - table_name="survey_data_df", - data_columns = data_columns, - unique_columns=unique_columns, - constraint="nasc > 0.0") - # ---- Use SQL table data if present - if nasc_sql_data is not None and not nasc_sql_data.empty: - return nasc_sql_data - elif "nasc_df" in data_dict.keys(): - return data_dict["nasc_df"] - -def get_sigma_bs_sql_data(db_file: str, - data_dict: dict, - unique_columns: list): - - # Get corresponding `sigma_bs` DataFrame - sigma_bs_df = query_dataset(db_file, - data_dict, - table_name="sigma_bs_mean_df", - data_columns=["sigma_bs", "sigma_bs_count"], - unique_columns=unique_columns) - - sigma_bs_df = SQL(db_file, "select", table_name="sigma_bs_mean_df") + return df \ No newline at end of file diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 896b3b23..ae7dde6b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update +from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary from .live_spatial_methods import apply_spatial_definitions from .live_acoustics import average_sigma_bs from ..acoustics import ts_length_regression, to_dB, to_linear @@ -163,15 +163,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df = ( ts_length_df .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] .apply(lambda x: average_sigma_bs(x, weights="length_count")) - .reset_index(name="sigma_bs") + .to_frame("sigma_bs") ) # For SQL database storage purposes, the sum and count are stored instead # ---- Count sum - sigma_bs_df["sigma_bs_count"] = ts_length_df["length_count"].sum() + sigma_bs_df["sigma_bs_count"] = ( + ts_length_df.reset_index() + .groupby(list(set(contrast_columns) - set(["length"])), observed=False)["length_count"] + .sum() + ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] + # ---- Reset index + sigma_bs_df = sigma_bs_df.reset_index() # Get the database file name acoustic_db = file_configuration["database"]["acoustics"] @@ -185,14 +192,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Populate table SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) else: - # ---- Create a filter condition command - condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # ---- Update the table key - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - operation="+", 
columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) - # ---- Update the actual `sigma_bs` value in the table - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], - operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + # ---- Check the present keys + current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + columns=key_list) + # ---- Insert if missing + if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + # ---- Update if not missing + else: + # ---- Create a filter condition command + condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) + # ---- Update the table key + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # ---- Update the actual `sigma_bs` value in the table + SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict): @@ -754,11 +769,11 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) specimen_length_complete = complete_distrib_df.copy() - specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]) + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) length_length_complete = complete_distrib_df.copy() - length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]) + length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) # ---- Concatenate the two datasets @@ -806,6 +821,53 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, np.concatenate([weight_all, weight_male, weight_female]) ) + # Get database file + biology_db = file_configuration["database"]["biology"] + + # Insert/update the table + # ---- Create id/primary key + key_values = ["-".join(fitted_weight_df.reset_index() + .loc[idx, contrast_columns] + .values.astype(str)) + for idx in fitted_weight_df.reset_index().index] + # ---- Add to the output + fitted_weight_df["id"] = key_values + if not SQL(biology_db, "validate", table_name="weight_stratum_df"): + # ---- Create + SQL(biology_db, "create", table_name="weight_stratum_df", + dataframe=fitted_weight_df, primary_keys=["id"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="weight_stratum_df", + dataframe=fitted_weight_df, id_columns=["id"]) + else: + # ---- Get previous values in the table + table_df = SQL(biology_db, "select", table_name="weight_stratum_df") + # ---- Check the table keys + table_keys = np.unique(table_df[contrast_columns].apply(tuple, axis=1)).tolist() + # ---- Check the current keys + fitted_weight_df["current_keys"] = fitted_weight_df[contrast_columns].apply(tuple, axis=1) + # ---- Get unique values + current_keys = 
np.unique(fitted_weight_df["current_keys"]).tolist() + # ---- Get INSERTION keys + insertion_keys = list(set(current_keys).difference(set(table_keys))) + # ---- Get UPDATE keys + update_keys = list(set(current_keys).intersection(set(table_keys))) + # ---- INSERT values + if insertion_keys: + # ---- Create DataFrame + insertion_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(insertion_keys)] + # ---- INSERT + SQL(biology_db, "insert", table_name="weight_stratum_df", + dataframe=insertion_df.drop(columns="current_keys")) + # ---- UPDATE values + if update_keys: + # ---- Create DataFrame + update_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(update_keys)] + # ---- UPDATE + sql_group_update(biology_db, dataframe=update_df, + table_name="weight_stratum_df", columns=["average_weight"], + unique_columns=contrast_columns, + id_columns=["id"]) # Return output return fitted_weight_df @@ -1025,3 +1087,42 @@ def weight_proportions(catch_data: pd.DataFrame, "aged_unaged_weight_proportions_df": aged_unaged_proportions, } +# TODO: NEED TO UPDATE TO EITHER INSERT IF NOT PRESENT OR UPDATE OTHERWISE ! ! ! +# ! SEE ABOVE +def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, + file_configuration: dict): + + # Get biology database + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database + biology_db = file_configuration["database"]["biology"] + + # Validate table + if not SQL(biology_db, "validate", table_name="strata_summary_df"): + + # Create copy + strata_df = spatial_data.copy() + + # Define new columns + strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", + "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan + # ---- Drop 'latitude_interval' + strata_df.drop(columns=["latitude_interval"], inplace=True) + + # ---- Create + SQL(biology_db, "create", table_name="strata_summary_df", + dataframe=strata_df, primary_keys=["stratum"]) + # ---- Populate table + SQL(biology_db, "insert", table_name="strata_summary_df", + dataframe=strata_df, id_columns=["stratum"]) + + # Get unique strata values + strata_values = np.unique(nasc_biology_data["stratum"]).tolist() + + # Update the table + sql_update_strata_summary(source_db=acoustic_db, target_db=biology_db, + source_table="survey_data_df", target_table="strata_summary_df", + data_columns=[("number_density", "mean"), + ("biomass_density", "mean")], + strata=strata_values) \ No newline at end of file diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 823ebac4..f507d63f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -2,7 +2,7 @@ from typing import Union, Tuple, Optional, List import yaml import re -from .sql_methods import SQL, query_processed_files, sql_data_exchange +from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database import pandas as pd from datetime import datetime import xarray as xr @@ -229,6 +229,9 @@ def validate_data_directory(file_configuration: dict, dataset: str, "Data loading argument `input_filenames` must be a list." 
) + # Initialize the database file + initialize_database(root_directory, file_settings) + # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( query_processed_files(root_directory, file_settings, data_files) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 9587c935..928ced70 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,7 +2,8 @@ import re from functools import reduce -from .sql_methods import SQL +from .sql_methods import SQL, sql_group_update +from .live_biology import summarize_strata from pathlib import Path from typing import Union, Tuple, Optional, List @@ -23,14 +24,13 @@ def get_unique_identifiers(data_dict: dict, # Get unique values of each contrast column across the biological datasets dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - for df in df_list if not df.empty and isinstance(df, pd.DataFrame)] + for df in df_list if isinstance(df, pd.DataFrame) and not df.empty] # Reduce into a single DataFrame if len(unique_columns) > 1: return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) else: - return reduce(lambda left, right: pd.merge(left, right, how='inner'), dfs) - + return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) def query_dataset(db_file: str, data_dict: dict, @@ -49,10 +49,10 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string + # ---- Create conditional string conditional_str = ( - " & ".join([f"{col} in {np.unique(unique_keys_df[col])}" - for col in unique_keys_df.columns]) + " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + for col in unique_keys_df.columns]) ) # ---- Append the additional constraint statement if present if constraint is not None: @@ -65,3 +65,206 @@ def query_dataset(db_file: str, # Return the table DataFrame return data_sql + +def get_average_strata_weights(db_file: str, + data_dict: dict, + unique_columns: list): + + # Get corresponding `weight_fitted_df` from the database + weight_fitted_sql_df = query_dataset(db_file, data_dict, table_name="weight_stratum_df", + data_columns=unique_columns + ["average_weight"], + unique_columns=unique_columns, + constraint="sex == 'all'") + # ---- Use SQL table data if present + if weight_fitted_sql_df is not None and not weight_fitted_sql_df.empty: + # ---- Return output + return weight_fitted_sql_df + else: + return None + +def acoustic_pipeline(acoustic_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database file + biology_db = file_configuration["database"]["biology"] + + # Check whether data dictionary is empty + if acoustic_dict["nasc_df"] is None or acoustic_dict["nasc_df"].empty: + # ---- Print, if verbose + if verbose: + print( + f"No new processed acoustic data available for processing." 
+ ) + else: + # Get related acoustic data + acoustic_df = get_nasc_sql_data(acoustic_db, + acoustic_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + acoustic_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) + + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + acoustic_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + + # Update the survey population estimate DataFrame with the newly computed densities + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) + +def get_nasc_sql_data(db_file: str, + data_dict: dict, + unique_columns: List[str]): + + # Add SELECTION columns + data_columns = ( + unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + "biomass_density", "id"] + ) + # ----- Get the SQL dataset + nasc_sql_data = query_dataset(db_file, + data_dict, + table_name="survey_data_df", + data_columns = data_columns, + unique_columns=unique_columns, + constraint="nasc > 0.0") + # ---- Use SQL table data if present + if nasc_sql_data is not None and not nasc_sql_data.empty: + return nasc_sql_data + elif "nasc_df" in data_dict.keys(): + return data_dict["nasc_df"] + +def get_sigma_bs_sql_data(db_file: str, + data_dict: dict, + unique_columns: list): + + # Get corresponding `sigma_bs` DataFrame + sigma_bs_sql_df = query_dataset(db_file, data_dict, table_name="sigma_bs_mean_df", + data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], + unique_columns=unique_columns) + # ---- Use SQL table data if present + if sigma_bs_sql_df is not None and not sigma_bs_sql_df.empty: + # ---- Compute the weighted average + sigma_bs_mean_sql_df = ( + sigma_bs_sql_df.groupby(unique_columns)[["sigma_bs", "sigma_bs_count"]] + .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) + .to_frame("sigma_bs_mean") + .reset_index() + ) + # ---- Return output + return sigma_bs_mean_sql_df + else: + return None + + + +def biology_pipeline(biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database 
file + biology_db = file_configuration["database"]["biology"] + + # Check for data completion + # ---- List of boolean values + full_biology_data = ( + [True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) + else False for _, df in biology_dict.items()] + ) + # ---- Validation + if not all(full_biology_data): + # ---- Print, if verbose + if verbose: + print( + f"No new processed biology data available for processing." + ) + else: + # Get related biology data + acoustic_df = get_nasc_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) + + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + biology_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + + # Update the survey population estimate DataFrame with the newly computed densities + if not nasc_biology.empty: + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 2dd8cefc..6ce7741f 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -53,7 +53,7 @@ def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), labels = inpfc_df.loc[:, "stratum"] - ) + ).astype(int) return strata diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 6f0b568e..f3cb7f5a 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,6 +1,10 @@ from typing import Union, Optional, Literal from pathlib import Path +from datetime import datetime import copy +import pandas as pd + +from .sql_methods import query_processed_files from .live_core import( LIVE_DATA_STRUCTURE, @@ -48,6 +52,8 @@ def __init__( ): # Initialize `meta` attribute self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + # ---- Add datetime + self.meta["date"] = datetime.now() # Loading the configuration settings and definitions that are used to # initialize the Survey class object @@ -74,6 +80,48 @@ def __init__( if verbose: pass + def __repr__(self): + + # Get any acoustic files created + if "acoustic_files" in self.meta["provenance"]: + # ---- Get the filenames + acoustic_filenames = self.meta["provenance"]["acoustic_files"] + # ---- Subset if 
many files are being processed + if len(acoustic_filenames) > 2: + acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + # ---- Format string + acoustic_files = ", ".join(acoustic_filenames) + else: + acoustic_files = "None" + + # Get any biology files created + if "biology_files" in self.meta["provenance"]: + # ---- Get the filenames + biology_filenames = self.meta["provenance"]["biology_files"] + # ---- Subset if many files are being processed + if len(biology_filenames) > 4: + biology_filenames = biology_filenames + ["..."] + # ---- Format string + biology_files = ", ".join(biology_filenames) + else: + biology_files = "None" + + # Get linked database names + linked_dbs = ( + "\n ".join([f"{key.title()}: {db}" for key, db in self.config["database"].items()]) + ) + + return ( + f"LiveSurvey-class object \n" + f"Timestamp: {self.meta['date']} \n" + f"Acoustic files being processed: \n {acoustic_files}\n" + f"Biology files being processed: \n {biology_files}\n" + f"Linked databases: \n {linked_dbs}" + ) + + def __str__(self): + return self.__repr__() + def load_acoustic_data(self, input_filenames: Optional[list] = None, verbose: bool = True): @@ -93,13 +141,17 @@ def load_acoustic_data(self, # TODO: SettingWithCopyWarning: self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), self.input["spatial"], - self.config) + self.config) + # ---- Add meta key + self.meta["provenance"].update({ + "acoustic_files": acoustic_files, + }) # TODO: Add verbosity for printing database filepaths/connections if verbose: # ---- Create file list file_list = "\n".join(acoustic_files) print( - f"The following acoustic files have been processed:\n" + f"The following acoustic files are being processed:\n" f"{file_list}." ) else: @@ -118,17 +170,22 @@ def load_biology_data(self, # ---- Create file list file_list = "\n".join(biology_files) print( - f"The following biological files have been processed:\n" + f"The following biological files are being processed:\n" f"{file_list}." 
) - # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config) + # Read in the biology data files + initial_biology_output = eldl.read_biology_files(biology_files, self.config) - # Preprocess the biology dataset - self.input["biology"], self.input["biology_processed"] = ( - preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) - ) + # Preprocess the biology dataset + self.input["biology"], self.input["biology_processed"] = ( + preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + ) + + # Add meta key + self.meta["provenance"].update({ + "biology_files": biology_files, + }) def process_biology_data(self): @@ -137,59 +194,73 @@ def process_biology_data(self): # ----- Unprocessed biology_unprocessed = self.input["biology"] - # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + # Check if data are present + unprocess_data_dfs = ( + [True if isinstance(df, pd.DataFrame) and not df.empty else False + for _, df in biology_unprocessed.items()] + ) + # ---- Proceed in processing the unprocessed data + if all(unprocess_data_dfs): - # Bin the length measurements of the biological data - bin_length_data(biology_unprocessed, self.config["length_distribution"]) + # Compute `sigma_bs` by sending it to the appropriate database table + compute_sigma_bs(biology_unprocessed["specimen_df"], + biology_unprocessed["length_df"], + self.config) - # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - - # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - self.config) - ) + # Bin the length measurements of the biological data + bin_length_data(biology_unprocessed, self.config["length_distribution"]) - # Compute the number proportions - specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, - length_binned, self.config) - ) + # Compute the length-weight regression and add it to the SQL table + length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], + self.config["length_distribution"], + self.config) + + # Compute length-binned counts for the aggregated and individual-based measurements + specimen_binned, specimen_binned_filtered, length_binned = ( + length_bin_counts(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + self.config) + ) - # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) - ) + # Compute the number proportions + specimen_number_proportion, length_number_proportion, sex_number_proportions = ( + number_proportions(specimen_binned, specimen_binned_filtered, + length_binned, self.config) + ) - # Calculate the average weights among male, female, and all fish - self.input["weight_stratumn_df"] = ( - compute_average_weights(specimen_number_proportion, - length_number_proportion, - 
sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) - ) - - # Compute the weight proportions - self.input["biology"].update({ - "proportions": weight_proportions(biology_unprocessed["catch_df"], - specimen_weight_binned, - length_weight_binned, - length_number_proportion, - length_weight_df, - self.config) - }) - + # Compute the length-binned weights for the aggregated and individual-based measurements + length_weight_binned, specimen_weight_binned = ( + length_bin_weights(biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df,self.config) + ) + + # Calculate the average weights among male, female, and all fish + self.input["weight_stratum_df"] = ( + compute_average_weights(specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config) + ) + + # Compute the weight proportions + self.input["biology"].update({ + "proportions": weight_proportions(biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config) + }) + + # Update the database + query_processed_files(self.config["data_root_dir"], + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files"], + processed=True) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -211,11 +282,27 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) # Format the dataframe and insert into the LiveSurvey object - self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, self.config) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, + self.config, + self.meta) + + # Update the database def estimate_population(self, - working_dataset: Literal["acoustic", "biology"]): + working_dataset: Literal["acoustic", "biology"], + verbose: bool = True): + + # method + if working_dataset == "acoustic": + eldp.acoustic_pipeline(self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) # method - pass + if working_dataset == "biology": + eldp.biology_pipeline(self.input["biology"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 3cdca5fd..335795b7 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -205,6 +205,16 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, elif not isinstance(columns, list): columns = [columns] + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + elif isinstance(x, pd.Timestamp): + return "'{}'".format(x) + elif x is None: + return 'NULL' + else: + return str(x) + # Format the SET command # ---- Update column by applying arithmetic between table and dataframe if operation is not None and dataframe is not None: @@ -222,7 +232,8 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list set_clause = ', '.join(set_list) - + [f"{column} = {dataframe[column].values[0]}" for column in columns] + ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Add the WHERE clause if a parsed condition is provided if condition is 
not None: # ---- Parse the conditional string @@ -314,6 +325,89 @@ def sql_select(connection: sqla.Connection, table_name: str, else: return converted_data +def validate_tables(db_file: str, table_name: Union[str, List[str]], + reference_dataframe: pd.DataFrame): + + # Helper function + def _validate_table(table): + # ---- Check table existence + if not SQL(db_file, "validate", table_name=table): + raise KeyError( + f"SQL database table `{table}` in `{db_file}` failed to initialize!" + ) + # ---- Get DataFrame dtypes (avoid 'object' and similar ambiguous typing) + expected_dtypes = ( + {col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns} + ) + # ---- Inspect the table + inspected_table = SQL(db_file, "inspect", table_name=table) + # ---- Get the column dtypes (with back-formatting via configuration mapping) + table_dtypes = { + col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ + for col in inspected_table.keys() + } + # ---- Compare keys + key_difference = list(set(expected_dtypes).difference(set(table_dtypes))) + # -------- Raise error, if needed + if key_difference: + raise KeyError( + f"The following columns are missing from table `{table}` in `{db_file}`: " + f"{', '.join(key_difference)}." + ) + # ---- Compare dtypes + dtypes_comparison = ( + {key: table_dtypes[key] for key in table_dtypes + if table_dtypes[key] != expected_dtypes.get(key)} + ) + # ---- Get key names + dtypes_different_names = list(set(dtypes_comparison)) + # ---- Raise error, if needed + if dtypes_different_names: + raise TypeError( + f"The following columns from table `{table}` in `{db_file}` had unexpected " + f"datatypes: {', '.join(dtypes_different_names)}." + ) + + # Iterate through tables to validate + if isinstance(table_name, list): + _ = [_validate_table(table) for table in table_name] + else: + _validate_table(table_name) + +def initialize_database(root_directory: Path, file_settings: dict): + + # Get the database name + db_name = file_settings["database_name"] + + # Create filepath to the SQL database + # ---- Create Path to SQL database file + db_directory = root_directory / "database" + # ---- Create the directory if it does not already exist + db_directory.mkdir(parents=True, exist_ok=True) + # ---- Complete path to the database file + db_file = db_directory / db_name + + # Spoof an empty DataFrame for formatting purposes + template_df = pd.DataFrame({"filepath": ["dummy/path/string"]}) + + # Create two tables for 'files read' and 'files processed' + # ---- Read files + SQL(db_file, "create", table_name="files_read", dataframe=template_df, + primary_keys=["filepath"]) + # ---- Processed files + SQL(db_file, "create", table_name="files_processed", dataframe=template_df, + primary_keys=["filepath"]) + + # Query the database ensure it exists + # ---- File existence + if not Path(db_file).exists(): + raise FileExistsError( + f"SQL database file `{db_file}` failed to initialize!" 
+ ) + + # Validate the created tables + validate_tables(db_file, ["files_read", "files_processed"], template_df) + SQL_COMMANDS = { "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), @@ -492,16 +586,15 @@ def format_sql_columns(kwargs: dict): return kwargs # TODO: Documentation -def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path]) -> dict: +def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path], + processed=False) -> dict: # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = root_directory / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) + db_directory = Path(root_directory) / "database" # ---- Complete path to the database file db_file = db_directory / db_name @@ -509,28 +602,19 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List files_str = [str(file) for file in files] # ---- Create DataFrame current_files = pd.DataFrame(files_str, columns=["filepath"]) - - # Check for the table `files_read` - files_read_tbl = SQL(db_file, "validate", table_name="files_read") - - # Validate whether the table exists; if not, create the table and then insert - if not files_read_tbl: - # ---- Create table - SQL(db_file, "create", table_name="files_read", dataframe=current_files, - primary_keys = ["filepath"]) - # ---- Populate table - SQL(db_file, "insert", table_name="files_read", dataframe=current_files) - # ---- Break early - return files_str, db_file - - # Query already existing files - previous_files = SQL(db_file, "select", table_name="files_read", output_type=str) - # ---- Insert file list - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) - - # Filter out previously processed files - # ---- Apply filter by comparing sets and return the output - return list(set(files_str) - set(previous_files)), db_file + + # Check against `files_processed` + previous_files = SQL(db_file, "select", table_name="files_processed", output_type=str) + + # Insert the files into the `files_read` table + if processed: + SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, + id_columns=["filepath"]) + else: + SQL(db_file, "insert", table_name="files_read", dataframe=current_files, + id_columns=["filepath"]) + # ---- Apply filter by comparing sets and return the output + return list(set(files_str) - set(previous_files)), db_file # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): @@ -582,6 +666,63 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str f"Attempted reset of [{str(db_file)}] failed." 
) +def sql_update_strata_summary(source_db: str, + target_db: str, + source_table: str, + target_table: str, + data_columns: List[tuple[str, str]], + strata: list): + + # Format strata list as a string + strata_str = ', '.join(map(str, strata)) + + # Function reference map + FUNCTION_MAP = { + "sum": {"function": "SUM", + "suffix": "sum"}, + "mean": {"function": "AVG", + "suffix": "mean"} + } + + # Prepare the SQL script + sql_script = f""" + -- Attach the source and target databases + ATTACH DATABASE '{source_db}' AS source; + ATTACH DATABASE '{target_db}' AS target; + + """ + + # Dynamically format the cross-database command + for data_column, method in data_columns: + # ----- Format the function-method-suffic keys + suffix = FUNCTION_MAP[method]["suffix"] + fun = FUNCTION_MAP[method]["function"] + # ---- Create the combined SQL command using f-strings + sql_script += f""" + -- Calculate averages and directly update the target table + UPDATE target.{target_table} + SET {data_column}_{suffix} = ( + SELECT {fun}({data_column}) + FROM source.{source_table} + WHERE stratum = target.{target_table}.stratum + ) + WHERE stratum IN ({strata_str}); + """ + # ----- Append DETACH commands only once at the end + sql_script += """ + -- Detach the databases + DETACH DATABASE source; + DETACH DATABASE target; + """ + + # Create the engine + engine = create_engine(f"sqlite:///{target_db}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript(sql_script) + # TODO: Documentation def SQL(db_file: str, command: str, **kwargs): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 872849fd..35ca3b3a 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,59 +1,51 @@ from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import reset_db_files -from echopop.live.sql_methods import query_processed_files -from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc -from echopop.live.live_biology import preprocess_biology_data -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange - -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, -) -from echopop.live.live_biology import ( - bin_length_data, - compute_average_weights, - compute_sigma_bs, - length_bin_counts, - length_weight_regression, - number_proportions, - length_bin_weights, - preprocess_biology_data, - weight_proportions -) -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl - -live_init_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/15052/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" - -realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path) +from echopop.live.sql_methods import SQL +# Set up `LiveSurvey` object +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +realtime_survey #################################################################################################### # TEST: ACOUSTICS 
#################################################################################################### -# NOTE: Reset database file for utility purposes -reset_db_files(realtime_survey.config) - # NOTE: LOAD DATA realtime_survey.load_acoustic_data() +realtime_survey +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") +SQL(realtime_survey.config["database"]["acoustics"], "map") +realtime_survey.config["database"] +realtime_survey.meta["provenance"] # NOTE: INITIAL PROCESSING [JUST ACOUSTIC] +# ! ERRORS OUT WHEN NUMBER OF FILES == 1 realtime_survey.process_acoustic_data() -realtime_survey.input +realtime_survey.estimate_population(working_dataset="acoustic") +self = realtime_survey +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") #################################################################################################### # TEST: BIOLOGY #################################################################################################### -# NOTE: Reset database file for utility purposes -reset_db_files(realtime_survey.config) - # NOTE: LOAD DATA realtime_survey.load_biology_data() -realtime_survey.input # NOTE: INITIAL PROCESSING [JUST BIOLOGY] realtime_survey.process_biology_data() -realtime_survey.input +realtime_survey.estimate_population(working_dataset="biology") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") +SQL(realtime_survey.config["database"]["biology"], "map") #################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### # NOTE: Acoustic / biological data converge here to derive population estimates # TODO: Add argument that indicates what the new datasets and what data need to be pulled in # TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} -realtime_survey.estimate_population() \ No newline at end of file +# ! 
SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) +realtime_survey.estimate_population(working_dataset="biology") +realtime_survey.estimate_population(working_dataset="acoustic") +#################################################################################################### +# TEST: GET DATA +#################################################################################################### +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 09e93fcc..1c69351e 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -12,14 +12,16 @@ import os import re import contextlib +from echopop.acoustics import ts_length_regression, to_linear, to_dB from sqlalchemy import create_engine, text, Engine, inspect from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange +from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange, initialize_database, sql_update_strata_summary from echopop.live import live_data_processing as eldp from echopop.live import live_data_loading as eldl +from echopop.live.live_data_processing import query_dataset, get_unique_identifiers from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import preprocess_acoustic_data, compute_nasc, format_acoustic_dataset +from echopop.live.live_acoustics import integrate_nasc from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -31,6 +33,28 @@ proportions_dict=analysis_dict["biology"]["proportions"]["number"] length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] + +files = data_files + + + + + + # Map the table names and validate table creation + # ---- Get table names + tables = SQL(db_file, "map") + # ---- `files_read` + if "files_read" not in tables: + raise KeyError( + f"SQL database table `files_read` in `{db_file}` failed to initialize!" + ) + # ---- `files_processed` + if "files_processed" not in tables: + raise KeyError( + f"SQL database table `files_processed` in `{db_file}` failed to initialize!" 
+ ) + + #################################################################################################### # TEST: YAML FILE CONFIGURATION # ---- Define filepaths @@ -64,50 +88,105 @@ # ---- Define unique columns unique_columns = spatial_column + contrast_columns -if working_dataset == "acoustics" and self.input["nasc_df"] is not None: - # ---- Get dataset - acoustic_df = get_nasc_sql_data(acoustic_db, - self.input["acoustics"], - unique_columns=unique_columns) - - -# Get corresponding `sigma_bs` -# sigma_bs_df = SQL(acoustic_db,"select",table_name="sigma_bs_mean_df", -# condition=f"stratum in {np.unique(self.input["nasc_df"]["stratum"])}") -sigma_bs_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") -sigma_bs_df["stratum"] = 2 -# ---- Compute the weighted average -sigma_bs_mean_df = ( - sigma_bs_df.groupby(spatial_column + ["species_id"])[["sigma_bs", "sigma_bs_count"]] - .apply(lambda df: np.average(df.sigma_bs, weights=df.sigma_bs_count)) - .to_frame("sigma_bs_mean") - .reset_index() -) - -# -nasc_biology = acoustic_df.merge(sigma_bs_mean_df, on=spatial_column) - -# Get the spatially averaged weights -weight_spatial_averages = self.input["weight_stratumn_df"] -# ---- Sub-select 'all' -general_weight_averages = weight_spatial_averages[weight_spatial_averages["sex"] == "all"] -general_weight_averages["stratum"] = 2 - -# -nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) -) - -# -nasc_biology = nasc_biology.merge(general_weight_averages) +acoustic_db = file_configuration["database"][working_dataset] +self = realtime_survey +acoustic_dict = self.input["acoustics"] +verbose = True +contrast_columns = [] +db_file = acoustic_db +table_name="survey_data_df" +data_columns = data_columns +unique_columns=unique_columns +constraint="nasc > 0.0" +data_dict = self.input["acoustics"] +data_dict["nasc_df"]["stratum"] = 1 +data_dict["prc_nasc_df"]["stratum"] = 2 +table_name = "sigma_bs_mean_df" +data_columns=["sigma_bs", "sigma_bs_count"] +biology_db +strata_df = self.input["spatial"]["strata"] + +def biology_pipeline(biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = []): + + # Get spatial column + spatial_column = file_configuration["spatial_column"] + unique_columns = spatial_column + contrast_columns + + # Get database file + acoustic_db = file_configuration["database"]["acoustics"] + + # Get biology database file + biology_db = file_configuration["database"]["biology"] + + # Check for data completion + # ---- List of boolean values + full_biology_data = ( + [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is not None] + ) + # ---- Validation + if not all(full_biology_data): + # ---- Print, if verbose + if verbose: + print( + f"No new processed biology data available for processing." 
+ ) + else: + # Get related biology data + acoustic_df = get_nasc_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) + sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + biology_dict, + unique_columns=unique_columns) + + # Calculate population estimates if valid data are available + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + # ---- Merge the NASC and sigma_bs datasets + nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + # ---- Compute the number densities (animals nmi^-2) + nasc_biology["number_density"] = ( + nasc_biology["nasc"] + / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + ) -nasc_biology["biomass_density"] = nasc_biology["number_density"] * nasc_biology["average_weight"] + # Get the corresponding average strata weights (computed for all fish) + weight_spatial_averages = get_average_strata_weights(biology_db, + biology_dict, + unique_columns=unique_columns) + + if weight_spatial_averages is not None: + # Merge average weights with number density estimates + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) -sql_group_update(acoustic_db, dataframe=nasc_biology, - table_name="survey_data_df", columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + # Compute biomass densities + nasc_biology["biomass_density"] = ( + nasc_biology["number_density"] * nasc_biology["average_weight"] + ) + # Update the survey population estimate DataFrame with the newly computed densities + if not nasc_biology.empty: + sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + + # Summarize strata + summarize_strata(nasc_biology, strata_df, file_configuration) + +db_file=acoustic_db +dataframe=nasc_biology +table_name="survey_data_df" +columns=["number_density", "biomass_density"] +unique_columns=["stratum", "longitude", "latitude", "ping_time"] +nasc_biology["number_density"].sum() / 2 +nasc_biology["number_density"] +SQL(acoustic_db, "select", table_name="survey_data_df") +SQL(biology_db, "select", table_name="strata_summary_df") strata_df = self.input["spatial"]["strata"].copy() strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan @@ -318,7 +397,7 @@ def sql_update_strata_summary(source_db: str, _ = dbapi_conn.executescript(sql_script) SQL(biology_db, "select", table_name=target_table) -SQL(acoustic_db, "select", table_name=source_table) +SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() connection.close() dbapi_conn.close() From 1ee2e208e7989e9809a24f626c884917fdb0e548 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 7 Aug 2024 18:28:58 -0700 Subject: [PATCH 17/81] Patches --- echopop/live/live_acoustics.py | 47 ++++---- echopop/live/live_biology.py | 158 +++++++++++++++++++----- echopop/live/live_data_processing.py | 4 +- echopop/live/sql_methods.py | 63 +++++++--- echopop/test_workflow.py | 172 ++++++++++++++++++++++++++- echopop/zarr_read_ingest_test.py | 2 +- 6 files changed, 370 insertions(+), 76 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 82e4c1a3..2da07e07 100644 --- a/echopop/live/live_acoustics.py +++ 
b/echopop/live/live_acoustics.py @@ -169,21 +169,24 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Return the dictionary return echometrics -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): +def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + nasc_dict = {"nasc": data_df["NASC"].sum()} # Horizontally concatenate `echometrics`, if `True` if echometrics: # ---- Compute values # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) + echometrics_dict = estimate_echometrics(data_df) # ---- Merge nasc_dict.update(echometrics_dict) # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) + # return pd.Series(nasc_dict) + return pd.DataFrame(nasc_dict, index=[0]) + + # return pd.DataFrame([nasc_dict]) def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True): @@ -193,24 +196,24 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources - if len(np.unique(acoustic_data_df["source"])) == 1: - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(integrate_nasc, echometrics) - .reset_index() - .sort_values("ping_time") - ) - else: - nasc_data_df = ( - acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(integrate_nasc, echometrics, include_groups=False) - .unstack().reset_index() - .sort_values("ping_time") - ) + # if len(np.unique(acoustic_data_df["ping_time"])) > 1: + # nasc_data_df = ( + # acoustic_data_df + # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + # observed=False) + # .apply(integrate_nasc, echometrics, include_groups=False).unstack() + # .reset_index() + # .sort_values("ping_time") + # ) + # else: + nasc_data_df = ( + acoustic_data_df + .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) + .reset_index() + .sort_values("ping_time") + ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index ae7dde6b..27a53bd1 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -162,7 +162,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Compute haul-specific means sigma_bs_df = ( ts_length_df - .groupby(list(set(contrast_columns) - set(["length"])), observed=False) + .groupby(key_list, observed=False) [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] .apply(lambda x: average_sigma_bs(x, weights="length_count")) .to_frame("sigma_bs") @@ -172,42 +172,75 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Count sum sigma_bs_df["sigma_bs_count"] = ( ts_length_df.reset_index() - .groupby(list(set(contrast_columns) - set(["length"])), observed=False)["length_count"] + .groupby(key_list, observed=False)["length_count"] .sum() ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] # ---- Reset index sigma_bs_df = sigma_bs_df.reset_index() - + # ---- Create a 
tuple-key that can be used as an identifier + sigma_bs_df.loc[:, "id"] = sigma_bs_df[key_list].apply(tuple, axis=1).astype(str) + # Get the database file name acoustic_db = file_configuration["database"]["acoustics"] # Check for `sigma_bs_mean_df` in the database file # ---- Query database if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + # ---- Create an insertion dataframe + insertion_df = sigma_bs_df.copy() # ---- Create - SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - primary_keys=list(set(contrast_columns) - set(["length"]))) + SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, + primary_keys=["id"]) # ---- Populate table - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) else: - # ---- Check the present keys - current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", - columns=key_list) - # ---- Insert if missing - if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) - # ---- Update if not missing - else: + # ---- Get previous values in the table + table_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") + # ---- Check the table keys + table_keys = np.unique(table_df["id"]).tolist() + # ---- Get unique values + current_keys = np.unique(sigma_bs_df["id"]).tolist() + # ---- Get INSERTION keys + insertion_keys = list(set(current_keys).difference(set(table_keys))) + # ---- Get UPDATE keys + update_keys = list(set(current_keys).intersection(set(table_keys))) + # ---- INSERT values + if insertion_keys: + # ---- Create DataFrame + insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] + # ---- INSERT + SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", + dataframe=insertion_df) + # ---- UPDATE values + if update_keys: + update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # ---- Update the table key - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) - # ---- Update the actual `sigma_bs` value in the table - SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], - operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) + sql_group_update(acoustic_db, dataframe=update_df, table_name="sigma_bs_mean_df", + columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", + unique_columns=["id"], id_columns=["id"]) + # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) + + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], + # condition=condition_str) + # # ---- Check the present keys + # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + # columns=key_list) + # # ---- Insert if missing + # if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): + # SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df) + # # ---- Update if not missing + # else: + # # ---- Create a filter condition command + # condition_str = " & ".join([f"{key} in 
{np.unique(sigma_bs_df[key])}" for key in key_list]) + # # ---- Update the table key + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # # ---- Update the actual `sigma_bs` value in the table + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], + # operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict): @@ -361,7 +394,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # columns=list(set(length_data.columns) - set(["length_bin"]))) # list(set(length_data.columns) - set(["length_bin"])) # Get length distribution - # distribution_df = file_configuration["length_distribution"] + distribution_df = file_configuration["length_distribution"] # Generate sex-specific interpolators for fitted length-weight values for binned length counts # ---- Parse the male- and female-specific fitted weight values @@ -407,7 +440,21 @@ def weight_interpolator(dataframe_row): ).reset_index() # Check for `length_weight_df` in the database file + # ---- Combine the datasets + full_weight_distrib = ( + pd.concat([length_table_sexed.rename(columns={"weight_interp": "weight"}), + specimen_table_sexed], ignore_index=True) + ) + # ---- Sum by bin + full_weight_distrib = ( + full_weight_distrib.groupby(contrast_columns + ["length_bin"])["weight"].sum().reset_index() + ) # ---- Create id/primary key + full_weight_distrib.loc[:, "id"] = ( + full_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1).astype(str) + .str.replace("'", "") + ) + # key_values = ["-".join(length_table_sexed.reset_index() .loc[idx, ["species_id", "sex", "length_bin"]] .values.astype(str)) @@ -416,20 +463,65 @@ def weight_interpolator(dataframe_row): length_table_sexed["id"] = key_values # ---- Query database if not SQL(biology_db, "validate", table_name="length_weight_df"): + # ---- Create full table + overall_weight_distrib = ( + pd.DataFrame({"stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + + [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1]}) + .merge(pd.DataFrame({"sex": ["male", "female"]}), how="cross") + .merge(pd.DataFrame( + {"species_id": np.unique(file_configuration["species"]["number_code"])} + ), how="cross") + .merge(distribution_df.filter(["length_bin"]), how="cross") + ) + # ---- Pre-allocate weight + overall_weight_distrib.loc[:, "weight"] = 0.0 + # ---- Create id/primary key + overall_weight_distrib.loc[:, "id"] = ( + overall_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1) + .astype(str) + .str.replace("'", "") + ) # ---- Create SQL(biology_db, "create", table_name="length_weight_df", - dataframe=length_table_sexed, primary_keys=["id"]) - # ---- Populate table + dataframe=overall_weight_distrib, primary_keys=["id"]) + # ---- INSERT SQL(biology_db, "insert", table_name="length_weight_df", - dataframe=length_table_sexed, id_columns=["id"]) - else: - # ---- Update the table - sql_group_update(db_file=biology_db, - dataframe=length_table_sexed, - table_name="length_weight_df", - columns=["weight_interp"], - unique_columns=contrast_columns, - id_columns=["id"]) + dataframe=overall_weight_distrib) + # ---- UPDATE + sql_group_update(biology_db, dataframe=full_weight_distrib, table_name="length_weight_df", + columns=["weight"], + 
unique_columns=["id"], id_columns=["id"]) + # table_df = SQL(biology_db, "select", table_name="length_weight_df") + # # ---- Check the table keys + # table_keys = np.unique(table_df["id"]).tolist() + # # ---- Get unique values + # current_keys = np.unique(full_weight_distrib["id"]).tolist() + # # ---- Get INSERTION keys + # insertion_keys = list(set(current_keys).difference(set(table_keys))) + # # ---- Get UPDATE keys + # update_keys = list(set(current_keys).intersection(set(table_keys))) + # # ---- INSERT values + # if insertion_keys: + # # ---- Create DataFrame + # insertion_df = full_weight_distrib[full_weight_distrib["id"].isin(insertion_keys)] + # # ---- INSERT + # SQL(biology_db, "insert", table_name="length_weight_df", + # dataframe=insertion_df) + # # ---- UPDATE values + # if update_keys: + # update_df = full_weight_distrib[full_weight_distrib["id"].isin(update_keys)] + # # ---- Create a filter condition command + # sql_group_update(biology_db, dataframe=update_df, table_name="length_weight_df", + # columns=["weight"], + # unique_columns=["id"], id_columns=["id"]) + + # # ---- Update the table + # sql_group_update(db_file=biology_db, + # dataframe=length_table_sexed, + # table_name="length_weight_df", + # columns=["weight_interp"], + # unique_columns=contrast_columns, + # id_columns=["id"]) # length_sql_sexed diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 928ced70..18d493d0 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -144,8 +144,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Update the survey population estimate DataFrame with the newly computed densities if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + columns=["number_density", "biomass_density"], + unique_columns=["id"]) # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 335795b7..f680e908 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -232,8 +232,7 @@ def format_value(x): set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list set_clause = ', '.join(set_list) - [f"{column} = {dataframe[column].values[0]}" for column in columns] - ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) + # Add the WHERE clause if a parsed condition is provided if condition is not None: # ---- Parse the conditional string @@ -442,6 +441,7 @@ def sql_group_update(db_file: str, table_name: str, columns: List[str], unique_columns: List[str], + operation: Optional[str] = None, id_columns: Optional[List[str]] = None): # Check for unique values contained within the table @@ -468,9 +468,7 @@ def sql_group_update(db_file: str, # Insert into the table if not otherwise present if not filtered_df.empty: SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) - - # Update the table - # ---- Format the conditional string + case_statements = [] for col in columns: case_stmt = "CASE" @@ -482,24 +480,57 @@ def sql_group_update(db_file: str, ]) # Add the WHEN condition to the CASE statement case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" - case_stmt += " END" - 
case_statements.append(f"{col} = {case_stmt}") + case_stmt += f" ELSE {col} END" + + if operation is not None: + case_statements.append(f"{col} = {col} {operation} {case_stmt}") + else: + case_statements.append(f"{col} = {case_stmt}") + + + # Update the table + # ---- Format the conditional string + # case_statements = [] + # for col in columns: + # case_stmt = "CASE" + # for _, row in dataframe.iterrows(): + # # Construct the filter condition based on unique_columns + # filter_conditions = ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # # Add the WHEN condition to the CASE statement + # case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" + # case_stmt += " END" + # case_statements.append(f"{col} = {case_stmt}") # Construct the full SQL UPDATE statement - update_clause = ', '.join(case_statements) + update_clause = ", ".join(case_statements) # Format the SQL COMMAND string + # sql_command = f""" + # UPDATE {table_name} + # SET {update_clause} + # WHERE ({' OR '.join([ + # ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # for _, row in dataframe.iterrows() + # ])}); + # """ sql_command = f""" UPDATE {table_name} - SET {update_clause} - WHERE ({' OR '.join([ - ' AND '.join([ - f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" - for col in unique_columns - ]) - for _, row in dataframe.iterrows() - ])}); + SET {update_clause}; """ + # WHERE ({' OR '.join([ + # ' AND '.join([ + # f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + # for col in unique_columns + # ]) + # for _, row in dataframe.iterrows() + # ])}); + # """ # Create engine engine = create_engine(f"sqlite:///{db_file}") diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 35ca3b3a..84bb298c 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,6 +1,16 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL - +from echopop.live.live_biology import ( + bin_length_data, + compute_average_weights, + compute_sigma_bs, + length_bin_counts, + length_bin_weights, + length_weight_regression, + number_proportions, + preprocess_biology_data, + weight_proportions +) # Set up `LiveSurvey` object live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" @@ -8,23 +18,140 @@ realtime_survey #################################################################################################### # TEST: ACOUSTICS +# Actual flow: +realtime_survey.load_acoustic_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_acoustic_data() +realtime_survey.estimate_population(working_dataset="acoustic") +amo = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +amo[amo.nasc > 0] +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") + +realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_biology_data() +realtime_survey.estimate_population(working_dataset="biology") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", 
table_name="strata_summary_df") +tbl = SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +tbl[tbl.weight > 0] +tbl.weight.sum() +# NOTE: Pulling successfully processed filenames +# ! This dictionary key name will change +realtime_survey.meta["provenance"][f"{working_dataset}_files"] +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") #################################################################################################### # NOTE: LOAD DATA +table_df[table_df.weight > 0] realtime_survey.load_acoustic_data() realtime_survey SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -SQL(realtime_survey.config["database"]["acoustics"], "map") +out = SQL(realtime_survey.config["database"]["acoustics"], "map") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +_ = SQL(realtime_survey.config["database"]["biology"], "drop", table_name="length_weight_df") realtime_survey.config["database"] realtime_survey.meta["provenance"] # NOTE: INITIAL PROCESSING [JUST ACOUSTIC] # ! ERRORS OUT WHEN NUMBER OF FILES == 1 realtime_survey.process_acoustic_data() +# ! sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "18": syntax error realtime_survey.estimate_population(working_dataset="acoustic") self = realtime_survey SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") #################################################################################################### # TEST: BIOLOGY +# Actual flow +realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] +realtime_survey.process_biology_data() +realtime_survey.estimate_population(working_dataset="biology") +self = realtime_survey +biology_unprocessed = self.input["biology"] +specimen_data = biology_unprocessed["specimen_df"] +length_data = biology_unprocessed["length_df"] +biology_dict = self.input["biology"] +file_configuration = self.config +strata_df = self.input["spatial"]["strata"] +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc, estimate_echometrics, integrate_nasc +from echopop.live.sql_methods import sql_group_update +from echopop.live.live_biology import summarize_strata +import numpy as np; import pandas as pd +echometrics: bool = True +acoustic_data_df = self.input["acoustics"]["prc_nasc_df"].copy() +spatial_column = ["stratum"] +# acoustic_data_df_copy = acoustic_data_df.copy() +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) + +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1).reset_index() +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) + +acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, 
include_groups=False) +acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(lambda g: integrate_nasc(g, echometrics)).reset_index(drop=True) +dd.index.get_level_values(-1) +cc.index.get_level_values(-1) +( + acoustic_data_df + .groupby(['longitude', 'latitude', 'ping_time', 'source'] + spatial_column) + .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) + .reset_index() + # .rename_axis(None, axis=0) # Remove any unwanted hierarchical index +) +(acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) + .apply(integrate_nasc, echometrics=True) + .reset_index()) +acoustic_data_df = acoustic_data_df[acoustic_data_df.distance == 0.0] +acoustic_data_df = acoustic_data_df_copy[acoustic_data_df_copy.distance==0.0] +pd.Series(nasc_dict).index +pd.DataFrame.from_dict(nasc_dict, orient="columns") +pd.DataFrame(nasc_dict, index=[0]) +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + observed=False) + .apply(lambda df: integrate_nasc(df, echometrics)).reset_index() +) + +( + acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source", "stratum"], observed=False) + .apply(lambda df: integrate_nasc(df, echometrics=True), include_groups=False) + .reset_index() +) + +result_df = acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) \ + .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) \ + .reset_index() + +print(acoustic_data_df.columns) +print(acoustic_data_df_copy.columns) +print(acoustic_data_df.dtypes) +print(acoustic_data_df_copy.dtypes) +# Inspect DataFrame before groupby +print(acoustic_data_df.head()) +print(acoustic_data_df_copy.head()) + +print(acoustic_data_df["longitude"].unique()) +print(acoustic_data_df_copy["longitude"].unique()) + +print(acoustic_data_df["latitude"].unique()) +print(acoustic_data_df_copy["latitude"].unique()) + +print("Grouped original index levels:", grouped_original.size().index.names) +print("Grouped reset index levels:", grouped_reset.size().index.names) + +print(acoustic_data_df["ping_time"].unique()) +print(acoustic_data_df_copy["ping_time"].unique()) + +print(acoustic_data_df["source"].unique()) +print(acoustic_data_df_copy["source"].unique()) + +grouped_original = acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) +grouped_reset = acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) +print(acoustic_data_df.index) +print(acoustic_data_df_copy.index) +grouped_original.index +print(grouped_original.size()) +print(grouped_reset.size()) +sql_group_update(acoustic_db, sigma_bs_df, table_name="sigma_bs_mean_df") #################################################################################################### # NOTE: LOAD DATA realtime_survey.load_biology_data() @@ -34,6 +161,8 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") SQL(realtime_survey.config["database"]["biology"], "map") +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["acoustics"], "drop", table_name="sigma_bs_mean_df") 
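# NOTE: Minimal sketch (toy, assumed values only) of the NASC-to-density conversion that the
# NOTE: acoustic/biology pipelines above apply before updating `survey_data_df`:
# NOTE: number_density = nasc / (4 * pi * sigma_bs_mean); biomass_density = number_density * average_weight
import numpy as np
import pandas as pd
toy_nasc = pd.DataFrame({"nasc": [1.5e3], "sigma_bs_mean": [2.0e-5], "average_weight": [0.5]})
toy_nasc["number_density"] = toy_nasc["nasc"] / (4.0 * np.pi * toy_nasc["sigma_bs_mean"])
toy_nasc["biomass_density"] = toy_nasc["number_density"] * toy_nasc["average_weight"]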
#################################################################################################### # TEST: POPULATION ESTIMATES #################################################################################################### @@ -43,9 +172,48 @@ # ! SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) realtime_survey.estimate_population(working_dataset="biology") realtime_survey.estimate_population(working_dataset="acoustic") +self = realtime_survey +acoustic_dict = self.input["acoustics"] +strata_df = self.input["spatial"]["strata"] +file_configuration = self.config +from echopop.live.sql_methods import SQL, sql_group_update +from echopop.live.live_biology import summarize_strata + +db_file = acoustic_db +dataframe=nasc_biology +table_name="survey_data_df" +columns=["number_density", "biomass_density"] +unique_columns = ["stratum", "longitude", "latitude", "ping_time"] #################################################################################################### # TEST: GET DATA #################################################################################################### SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") + +SQL(acoustic_db, "drop", table_name="sigma_bs_mean_df") + +##### +# NOTE: Below are hypothetical visualizations +# +survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", + table_name="survey_data_df") + +import matplotlib.pyplot as plt +import numpy as np + +survey_data.loc[0, "nasc"] = 1e3 + +plt.plot(survey_data["longitude"], survey_data["latitude"]) +plt.scatter(survey_data["longitude"], survey_data["latitude"], s=survey_data["nasc"]) +plt.show() + +SQL(realtime_survey.config["database"]["biology"], "map") +# ! 
NEED TO ENSURE THAT TABLE FOR LENGTH/WEIGHT HISTOGRAM IS AVAILABLE +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +realtime_survey.input["spatial"]["strata"] +# +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 1c69351e..e6d00cc9 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -29,7 +29,7 @@ survey_2019.transect_analysis() survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] analysis_dict = survey_2019.analysis["transect"] - +SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") proportions_dict=analysis_dict["biology"]["proportions"]["number"] length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] From 0f31b20d1a4bad0a82902f0e70fa6bc7a3972696 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 8 Aug 2024 10:53:47 -0700 Subject: [PATCH 18/81] Cleaned up `test_workflow` --- echopop/test_workflow.py | 252 ++++++++------------------------------- 1 file changed, 51 insertions(+), 201 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 84bb298c..e52c6739 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,219 +1,69 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL -from echopop.live.live_biology import ( - bin_length_data, - compute_average_weights, - compute_sigma_bs, - length_bin_counts, - length_bin_weights, - length_weight_regression, - number_proportions, - preprocess_biology_data, - weight_proportions -) -# Set up `LiveSurvey` object + +#################################################################################################### +# TEST: Set up `LiveSurvey` object +# NOTE: General initialization parameter configuration live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +# NOTE: File configuration live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# NOTE: Create object realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) realtime_survey #################################################################################################### -# TEST: ACOUSTICS -# Actual flow: -realtime_survey.load_acoustic_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_acoustic_data() -realtime_survey.estimate_population(working_dataset="acoustic") -amo = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -amo[amo.nasc > 0] -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") - -realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_biology_data() -realtime_survey.estimate_population(working_dataset="biology") 
-SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -tbl = SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -tbl[tbl.weight > 0] -tbl.weight.sum() -# NOTE: Pulling successfully processed filenames -# ! This dictionary key name will change -realtime_survey.meta["provenance"][f"{working_dataset}_files"] -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -#################################################################################################### -# NOTE: LOAD DATA -table_df[table_df.weight > 0] +# TEST: TRIGGER --> NEW ACOUSTIC DATA +# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_acoustic_data() -realtime_survey -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_read") -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="files_processed") -out = SQL(realtime_survey.config["database"]["acoustics"], "map") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -_ = SQL(realtime_survey.config["database"]["biology"], "drop", table_name="length_weight_df") -realtime_survey.config["database"] -realtime_survey.meta["provenance"] -# NOTE: INITIAL PROCESSING [JUST ACOUSTIC] -# ! ERRORS OUT WHEN NUMBER OF FILES == 1 +# NOTE: Process new acoustic data +# NOTE: This will update linked database tables realtime_survey.process_acoustic_data() -# ! sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "18": syntax error +# NOTE: Generate population estimates (or pass if there are no biological data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` realtime_survey.estimate_population(working_dataset="acoustic") -self = realtime_survey -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -#################################################################################################### -# TEST: BIOLOGY -# Actual flow -realtime_survey.load_biology_data() #`input_filenames` = Optional[List[str]] -realtime_survey.process_biology_data() -realtime_survey.estimate_population(working_dataset="biology") -self = realtime_survey -biology_unprocessed = self.input["biology"] -specimen_data = biology_unprocessed["specimen_df"] -length_data = biology_unprocessed["length_df"] -biology_dict = self.input["biology"] -file_configuration = self.config -strata_df = self.input["spatial"]["strata"] -from echopop.live.live_acoustics import average_sigma_bs, compute_nasc, estimate_echometrics, integrate_nasc -from echopop.live.sql_methods import sql_group_update -from echopop.live.live_biology import summarize_strata -import numpy as np; import pandas as pd -echometrics: bool = True -acoustic_data_df = self.input["acoustics"]["prc_nasc_df"].copy() -spatial_column = ["stratum"] -# acoustic_data_df_copy = acoustic_data_df.copy() -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) - -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1).reset_index() -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) 
-acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(integrate_nasc, echometrics, include_groups=False) - -acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column, as_index=True, group_keys=True).apply(integrate_nasc, echometrics, include_groups=False) -acoustic_data_df.groupby(["longitude", "latitude", "ping_time"] + spatial_column).apply(lambda g: integrate_nasc(g, echometrics)).reset_index(drop=True) -dd.index.get_level_values(-1) -cc.index.get_level_values(-1) -( - acoustic_data_df - .groupby(['longitude', 'latitude', 'ping_time', 'source'] + spatial_column) - .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) - .reset_index() - # .rename_axis(None, axis=0) # Remove any unwanted hierarchical index -) -(acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) - .apply(integrate_nasc, echometrics=True) - .reset_index()) -acoustic_data_df = acoustic_data_df[acoustic_data_df.distance == 0.0] -acoustic_data_df = acoustic_data_df_copy[acoustic_data_df_copy.distance==0.0] -pd.Series(nasc_dict).index -pd.DataFrame.from_dict(nasc_dict, orient="columns") -pd.DataFrame(nasc_dict, index=[0]) -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, - observed=False) - .apply(lambda df: integrate_nasc(df, echometrics)).reset_index() -) - -( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source", "stratum"], observed=False) - .apply(lambda df: integrate_nasc(df, echometrics=True), include_groups=False) - .reset_index() -) - -result_df = acoustic_data_df.groupby(['longitude', 'latitude', 'ping_time', 'source', 'stratum']) \ - .apply(lambda g: integrate_nasc(g, echometrics=True), include_groups=False) \ - .reset_index() - -print(acoustic_data_df.columns) -print(acoustic_data_df_copy.columns) -print(acoustic_data_df.dtypes) -print(acoustic_data_df_copy.dtypes) -# Inspect DataFrame before groupby -print(acoustic_data_df.head()) -print(acoustic_data_df_copy.head()) - -print(acoustic_data_df["longitude"].unique()) -print(acoustic_data_df_copy["longitude"].unique()) - -print(acoustic_data_df["latitude"].unique()) -print(acoustic_data_df_copy["latitude"].unique()) - -print("Grouped original index levels:", grouped_original.size().index.names) -print("Grouped reset index levels:", grouped_reset.size().index.names) - -print(acoustic_data_df["ping_time"].unique()) -print(acoustic_data_df_copy["ping_time"].unique()) - -print(acoustic_data_df["source"].unique()) -print(acoustic_data_df_copy["source"].unique()) - -grouped_original = acoustic_data_df_copy.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) -grouped_reset = acoustic_data_df.groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, observed=False) -print(acoustic_data_df.index) -print(acoustic_data_df_copy.index) -grouped_original.index -print(grouped_original.size()) -print(grouped_reset.size()) -sql_group_update(acoustic_db, sigma_bs_df, table_name="sigma_bs_mean_df") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey 
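# NOTE: Hypothetical combined trigger loop (sketch only; how new-file triggers arrive is assumed
# NOTE: to be external): the same load -> process -> estimate sequence shown above can be driven
# NOTE: per incoming dataset type.
for dataset in ["acoustic", "biology"]:
    if dataset == "acoustic":
        realtime_survey.load_acoustic_data()
        realtime_survey.process_acoustic_data()
    else:
        realtime_survey.load_biology_data()
        realtime_survey.process_biology_data()
    realtime_survey.estimate_population(working_dataset=dataset)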
#################################################################################################### -# NOTE: LOAD DATA +# TEST: TRIGGER --> NEW BIOLOGY DATA +# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_biology_data() -# NOTE: INITIAL PROCESSING [JUST BIOLOGY] +# NOTE: Process new biological data +# NOTE: This will update linked database tables realtime_survey.process_biology_data() +# NOTE: Generate population estimates (or pass if there are no acoustic data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` realtime_survey.estimate_population(working_dataset="biology") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_read") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="files_processed") -SQL(realtime_survey.config["database"]["biology"], "map") -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["acoustics"], "drop", table_name="sigma_bs_mean_df") -#################################################################################################### -# TEST: POPULATION ESTIMATES -#################################################################################################### -# NOTE: Acoustic / biological data converge here to derive population estimates -# TODO: Add argument that indicates what the new datasets and what data need to be pulled in -# TODO: ARGUMENT {working_dataset: Literal["acoustic", "biology"]} -# ! SQL ARGUMENT STRINGS FAIL ON > 1000 ENTRIES (250 ROWS) -realtime_survey.estimate_population(working_dataset="biology") -realtime_survey.estimate_population(working_dataset="acoustic") -self = realtime_survey -acoustic_dict = self.input["acoustics"] -strata_df = self.input["spatial"]["strata"] -file_configuration = self.config -from echopop.live.sql_methods import SQL, sql_group_update -from echopop.live.live_biology import summarize_strata - -db_file = acoustic_db -dataframe=nasc_biology -table_name="survey_data_df" -columns=["number_density", "biomass_density"] -unique_columns = ["stratum", "longitude", "latitude", "ping_time"] +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey #################################################################################################### -# TEST: GET DATA +# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# NOTE: `LiveSurvey.meta` attribute +# ---- ACOUSTIC +realtime_survey.meta["provenance"]["acoustic_files"] +# ---- BIOLOGICAL +realtime_survey.meta["provenance"]["biology_files"] +# NOTE: SQL function query from database file [cumulative list] +# ---- ACOUSTIC +SQL(db_file=realtime_survey.config["database"]["acoustics"], + command="select", table_name="files_processed") +# ---- BIOLOGICAL +SQL(db_file=realtime_survey.config["database"]["biology"], + command="select", table_name="files_processed") #################################################################################################### -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# !!! 
The SQL functions will fail if the tables have not yet been created/initialized +# ---- ACOUSTICS +# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") - -SQL(acoustic_db, "drop", table_name="sigma_bs_mean_df") - -##### -# NOTE: Below are hypothetical visualizations -# -survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", - table_name="survey_data_df") - -import matplotlib.pyplot as plt -import numpy as np - -survey_data.loc[0, "nasc"] = 1e3 - -plt.plot(survey_data["longitude"], survey_data["latitude"]) -plt.scatter(survey_data["longitude"], survey_data["latitude"], s=survey_data["nasc"]) -plt.show() - -SQL(realtime_survey.config["database"]["biology"], "map") -# ! NEED TO ENSURE THAT TABLE FOR LENGTH/WEIGHT HISTOGRAM IS AVAILABLE -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Along-track acoustically-derived number/biomass densities and NASC +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# ---- BIOLOGICAL +# NOTE: Fitted (discretized) length-weight relationship SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -realtime_survey.input["spatial"]["strata"] -# -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") \ No newline at end of file +# NOTE: Quantized length-binned weights (summed) +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Average weights per stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") \ No newline at end of file From 40f3d7bfc4e368a52304d7453d970ac002813a7b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 11:35:28 -0700 Subject: [PATCH 19/81] YAML config settings adjustment for db dir --- config_files/live_initialization_config.yml | 4 +- config_files/live_survey_year_2019_config.yml | 5 +- echopop/live/live_acoustics.py | 27 +- echopop/live/live_biology.py | 14 +- echopop/live/live_data_loading.py | 9 +- echopop/live/live_data_processing.py | 54 ++- echopop/live/live_spatial_methods.py | 410 +++++++++++++++--- echopop/live/live_survey.py | 8 +- echopop/live/sql_methods.py | 5 +- 9 files changed, 423 insertions(+), 113 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index 9436cefc..ae265343 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -35,8 +35,8 @@ longitude: [-135.25, -117.00] # x/y (or E-W/N-S) grid resolution in nmi grid_resolution: - x_distance: 50.0 - y_distance: 50.0 + x_distance: 25.0 + y_distance: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. 
This diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index 4111ea05..e52db83c 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -15,6 +15,7 @@ species: # Directory path that contains all input data needed data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files +database_directory: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database ############################################################################## # Input data directories @@ -44,6 +45,8 @@ input_directories: trawl_info: operation_info coastline: directory: coastline/ - coastline_name: ne_110m_land + coastline_name: ne_10m_land + grid: + database_name: grid.db ... diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 2da07e07..6c1ebf08 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -3,7 +3,7 @@ import pandas as pd from ..acoustics import ts_length_regression, to_linear, to_dB -from .live_spatial_methods import apply_spatial_definitions +from .live_spatial_methods import apply_spatial_definitions, apply_griddify_definitions from .sql_methods import sql_data_exchange, SQL, query_processed_files # TODO: Documentation @@ -26,7 +26,7 @@ def configure_transmit_frequency(frequency_values: pd.Series, return frequency_values # TODO: Documentation -def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, +def preprocess_acoustic_data(survey_data: pd.DataFrame, spatial_dict: dict, file_configuration: dict) -> pd.DataFrame: @@ -37,15 +37,21 @@ def preprocess_acoustic_data(prc_nasc_df: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - prc_nasc_df.loc[:, "frequency_nominal"] = ( - configure_transmit_frequency(prc_nasc_df.loc[:, "frequency_nominal"], + survey_data.loc[:, "frequency_nominal"] = ( + configure_transmit_frequency(survey_data.loc[:, "frequency_nominal"], transmit_settings, acoustic_analysis_settings["dataset_units"]["frequency"]) ) # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( - prc_nasc_df[prc_nasc_df["frequency_nominal"] == transmit_settings["frequency"]] + survey_data[survey_data["frequency_nominal"] == transmit_settings["frequency"]] ) + + # Get grid coordinates + prc_nasc_df_filtered = pd.concat([ + prc_nasc_df_filtered, + apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]) + ], axis = 1) # Apply spatial settings prc_nasc_df_filtered = ( @@ -192,7 +198,10 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True): # Get spatial definitions, if any - spatial_column = file_configuration["spatial_column"] + # spatial_column = file_configuration["spatial_column"] + + # Get stratum column, if any + gridding_column = file_configuration["gridding_column"] # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources @@ -208,7 +217,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # else: nasc_data_df = ( acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + .groupby(["longitude", "latitude", "ping_time", "source"] + gridding_column, observed=False) .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) .reset_index() @@ -225,7 +234,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, ) # ---- Reorder columns 
nasc_data_df = nasc_data_df[ - spatial_column + gridding_column + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", "occupied_area"] @@ -244,7 +253,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Add population-specific columns (specified in the file configuration) # TODO: Add to `yaml` file for configuration; hard-code for now - add_columns = ["number_density", "biomass_density", "abundance", "biomass"] + add_columns = ["number_density", "biomass_density"] # ---- df[add_columns] = 0.0 # ---- Assign values for key values diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 27a53bd1..5fcf3c32 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -183,21 +183,21 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, sigma_bs_df.loc[:, "id"] = sigma_bs_df[key_list].apply(tuple, axis=1).astype(str) # Get the database file name - acoustic_db = file_configuration["database"]["acoustics"] + biology_db = file_configuration["database"]["biology"] # Check for `sigma_bs_mean_df` in the database file # ---- Query database - if not SQL(acoustic_db, "validate", table_name="sigma_bs_mean_df"): + if not SQL(biology_db, "validate", table_name="sigma_bs_mean_df"): # ---- Create an insertion dataframe insertion_df = sigma_bs_df.copy() # ---- Create - SQL(acoustic_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, + SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, primary_keys=["id"]) # ---- Populate table - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) else: # ---- Get previous values in the table - table_df = SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") + table_df = SQL(biology_db, "select", table_name="sigma_bs_mean_df") # ---- Check the table keys table_keys = np.unique(table_df["id"]).tolist() # ---- Get unique values @@ -211,13 +211,13 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create DataFrame insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] # ---- INSERT - SQL(acoustic_db, "insert", table_name="sigma_bs_mean_df", + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) # ---- UPDATE values if update_keys: update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - sql_group_update(acoustic_db, dataframe=update_df, table_name="sigma_bs_mean_df", + sql_group_update(biology_db, dataframe=update_df, table_name="sigma_bs_mean_df", columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", unique_columns=["id"], id_columns=["id"]) # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f507d63f..84316027 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -112,10 +112,14 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] # Add SQL file to dict + # file_configuration["database"]["biology"] = ( + # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + # ) 
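+    # NOTE: The biology database file is assumed to resolve from the `database_directory` key
+    # NOTE: defined in the survey-year YAML, rather than from `data_root_dir`/"database"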
file_configuration["database"]["biology"] = ( - Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + Path(file_configuration["database_directory"]) / file_settings["database_name"] ) + # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists @@ -540,6 +544,9 @@ def configure_spatial_settings(file_configuration: dict): # ---- Empty `spatial_column` key file_configuration.update({"spatial_column": []}) + # Add grid + file_configuration.update({"gridding_column": file_configuration["stratum_column"] + ["x", "y"]}) + # Return the dictionary as an output return spatial_dict diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 18d493d0..c46317a0 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -20,17 +20,25 @@ def get_unique_identifiers(data_dict: dict, unique_columns: List[str]) -> pd.DataFrame: # Gather all dataframes from a dictionary into a list - df_list = [df for _, df in data_dict.items()] + if isinstance(data_dict, dict): + df_list = [df for _, df in data_dict.items()] + else: + df_list = [data_dict] # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - for df in df_list if isinstance(df, pd.DataFrame) and not df.empty] + # dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns + # for df in df_list if isinstance(df, pd.DataFrame) and not df.empty and col in df.columns] + combined_df = pd.concat( + [df[unique_columns] for df in df_list if all(col in df.columns for col in unique_columns)], + ignore_index=True + ).drop_duplicates() # Reduce into a single DataFrame - if len(unique_columns) > 1: - return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) - else: - return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) + return combined_df + # if len(unique_columns) > 1: + # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # else: + # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) def query_dataset(db_file: str, data_dict: dict, @@ -49,14 +57,18 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string - conditional_str = ( - " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - for col in unique_keys_df.columns]) - ) + # ---- Create conditional string + conditional_str = " | ".join( + [" & ".join([f"{col} = {val}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows()] + ) + # conditional_str = ( + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) + # ) # ---- Append the additional constraint statement if present if constraint is not None: - conditional_str += f" & {constraint}" + conditional_str = f"({conditional_str})" + f" & {constraint}" # ---- SELECT the dataset using the conidtional statement data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, condition=conditional_str).filter(data_columns) @@ -90,7 +102,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Get spatial column spatial_column = file_configuration["spatial_column"] - unique_columns = spatial_column 
+ contrast_columns + gridding_column = file_configuration["gridding_column"] + unique_columns = gridding_column + contrast_columns # Get database file acoustic_db = file_configuration["database"]["acoustics"] @@ -112,15 +125,15 @@ def acoustic_pipeline(acoustic_dict: dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + sigma_bs_df = get_sigma_bs_sql_data(biology_db, acoustic_dict, - unique_columns=unique_columns) + unique_columns=["stratum"]) # Calculate population estimates if valid data are available if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) + nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column + contrast_columns) # ---- Compute the number densities (animals nmi^-2) nasc_biology["number_density"] = ( nasc_biology["nasc"] @@ -130,11 +143,12 @@ def acoustic_pipeline(acoustic_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, acoustic_dict, - unique_columns=unique_columns) + unique_columns=spatial_column + contrast_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + nasc_biology = nasc_biology.merge(weight_spatial_averages, + on=spatial_column + contrast_columns) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -156,7 +170,7 @@ def get_nasc_sql_data(db_file: str, # Add SELECTION columns data_columns = ( - unique_columns + ["x", "y", "longitude", "latitude", "ping_time", "nasc", "number_density", + unique_columns + ["longitude", "latitude", "ping_time", "nasc", "number_density", "biomass_density", "id"] ) # ----- Get the SQL dataset diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 6ce7741f..29f5df4e 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -4,6 +4,9 @@ from geopy.distance import distance from ..spatial.projection import utm_string_generator import shapely.geometry +from shapely.geometry import box +import sqlalchemy as sqla +from pathlib import Path from typing import Union def create_inpfc_strata(spatial_config: dict): @@ -143,7 +146,7 @@ def define_boundary_box(boundary_dict: dict, projection: str): crs=projection, ) -def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): +def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Extract the griddification definitions griddify_definitions = spatial_config["griddify"] @@ -190,77 +193,346 @@ def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_ # Get the centroids cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid - # Get the `prc_nasc_df` values, if they exist, and apply stratification information - if not acoustic_data["prc_nasc_df"].empty: + # Convert to GeoDataFrame + dataset_gdf = gpd.GeoDataFrame( + data=dataset, + geometry=gpd.points_from_xy(dataset["longitude"], dataset["latitude"]), + crs=projection, + ) + # ---- To UTM + dataset_gdf = dataset_gdf.to_crs(projection_new) + + # Extract x- and y-coordinates + dataset_gdf["x"] = dataset_gdf["geometry"].x + dataset_gdf["y"] = dataset_gdf["geometry"].y + + # Bin 
the longitude data + dataset_gdf["stratum_x"] = pd.cut( + dataset_gdf["x"], + np.arange(xmin, xmax+x_step, x_step), + right = True, + labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), + ).astype(int) + 1 + + # Bin the latitude data + dataset_gdf["stratum_y"] = pd.cut( + dataset_gdf["y"], + np.arange(ymin, ymax+y_step, y_step), + right = True, + labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), + ).astype(int) + 1 + + # Update the original dataset + return ( + dataset_gdf.loc[:, ["stratum_x", "stratum_y"]] + .rename(columns={"stratum_x": "x", "stratum_y": "y"}) + ) + # dataset.loc[:, "x"] = dataset_gdf.copy().loc[:, "stratum_x"] + # dataset.loc[:, "y"] = dataset_gdf.copy().loc[:, "stratum_y"] - # - prc_nasc_df = acoustic_data["prc_nasc_df"] - # to GDF - prc_nasc_gdf = gpd.GeoDataFrame( - data=prc_nasc_df, - geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), - crs=projection, - ) - # to UTM - prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) - - prc_nasc_new["x"] = prc_nasc_new["geometry"].x - prc_nasc_new["y"] = prc_nasc_new["geometry"].y - - # ---- Bin the latitude data - prc_nasc_new["stratum_x"] = pd.cut( - prc_nasc_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - prc_nasc_new["stratum_y"] = pd.cut( - prc_nasc_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - acoustic_data["prc_nasc_df"]["stratum"] = ( - prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) - ) +# def apply_griddify_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): - if not biology_data["trawl_info_df"].empty: +# # Extract the griddification definitions +# griddify_definitions = spatial_config["griddify"] - # - trawl_info_df = biology_data["trawl_info_df"] +# # Get the projection definition +# projection = spatial_config["projection"] - # to GDF - trawl_info_gdf = gpd.GeoDataFrame( - data=trawl_info_df, - geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), - crs=projection, - ) - # to UTM - trawl_info_new = trawl_info_gdf.to_crs(projection_new) - - trawl_info_new["x"] = trawl_info_new["geometry"].x - trawl_info_new["y"] = trawl_info_new["geometry"].y - - # ---- Bin the latitude data - trawl_info_new["stratum_x"] = pd.cut( - trawl_info_new["x"], - np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 - - trawl_info_new["stratum_y"] = pd.cut( - trawl_info_new["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 - - # - biology_data["trawl_info_df"]["stratum"] = ( - trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# # Compute the boundary box GeoDataFrame +# boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) + +# # Convert the coordinates, if needed +# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): +# # ---- Compute the equivalent UTM string +# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), +# np.median(boundary_box.loc[0:3, "y"]))) +# # ---- Compute the boundary box GeoDataFrame with the new projection +# boundary_box = boundary_box.to_crs(utm_num) +# # 
---- Create a new projection for later +# projection_new = f"epsg:{utm_num}" +# else: +# projection_new = projection + +# # Define the step sizes +# # ---- Define x step size +# x_step = distance(nautical=griddify_definitions["grid_resolution"]["x_distance"]).meters +# # ---- Define y step size +# y_step = distance(nautical=griddify_definitions["grid_resolution"]["y_distance"]).meters + +# # Get the boundary tuple +# xmin, ymin, xmax, ymax = boundary_box.total_bounds + +# # Generate the cells +# grid_cells = [] +# # ---- Iterate through +# for y0 in np.arange(ymin, ymax+y_step, y_step): +# for x0 in np.arange(xmin, xmax+x_step, x_step): +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# # Convert to a GeoDataFrame +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=projection_new) + +# # Get the centroids +# cells_gdf["cell_centroid"] = cells_gdf["geometry"].centroid + +# # Get the `prc_nasc_df` values, if they exist, and apply stratification information +# if not acoustic_data["prc_nasc_df"].empty: + +# # +# prc_nasc_df = acoustic_data["prc_nasc_df"] + +# # to GDF +# prc_nasc_gdf = gpd.GeoDataFrame( +# data=prc_nasc_df, +# geometry=gpd.points_from_xy(prc_nasc_df["longitude"], prc_nasc_df["latitude"]), +# crs=projection, +# ) +# # to UTM +# prc_nasc_new = prc_nasc_gdf.to_crs(projection_new) + +# prc_nasc_new["x"] = prc_nasc_new["geometry"].x +# prc_nasc_new["y"] = prc_nasc_new["geometry"].y + +# # ---- Bin the latitude data +# prc_nasc_new["stratum_x"] = pd.cut( +# prc_nasc_new["x"], +# np.arange(xmin, xmax+x_step, x_step), +# right = True, +# labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), +# ).astype(int) + 1 + +# prc_nasc_new["stratum_y"] = pd.cut( +# prc_nasc_new["y"], +# np.arange(ymin, ymax+y_step, y_step), +# right = True, +# labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), +# ).astype(int) + 1 + +# # +# acoustic_data["prc_nasc_df"]["stratum"] = ( +# prc_nasc_new["stratum_x"].astype(str) + "-" + prc_nasc_new["stratum_y"].astype(str) +# ) + +# if not biology_data["trawl_info_df"].empty: + +# # +# trawl_info_df = biology_data["trawl_info_df"] + +# # to GDF +# trawl_info_gdf = gpd.GeoDataFrame( +# data=trawl_info_df, +# geometry=gpd.points_from_xy(trawl_info_df["longitude"], trawl_info_df["latitude"]), +# crs=projection, +# ) +# # to UTM +# trawl_info_new = trawl_info_gdf.to_crs(projection_new) + +# trawl_info_new["x"] = trawl_info_new["geometry"].x +# trawl_info_new["y"] = trawl_info_new["geometry"].y + +# # ---- Bin the latitude data +# trawl_info_new["stratum_x"] = pd.cut( +# trawl_info_new["x"], +# np.arange(xmin, xmax+x_step, x_step), +# right = True, +# labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), +# ).astype(int) + 1 + +# trawl_info_new["stratum_y"] = pd.cut( +# trawl_info_new["y"], +# np.arange(ymin, ymax+y_step, y_step), +# right = True, +# labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), +# ).astype(int) + 1 + +# # +# biology_data["trawl_info_df"]["stratum"] = ( +# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# ) + +def initialize_grid(file_configuration = dict): + + # Get root directory, if defined + if "data_root_dir" in file_configuration: + root_dir = Path(file_configuration["data_root_dir"]) + else: + root_dir = Path() + + # Get `grid` settings + grid_database = file_configuration["input_directories"]["grid"]["database_name"] + # ---- + db_directory = Path(file_configuration["database_directory"]) + + 
# Create full filepath + # db_filepath = root_dir / "database" / grid_database + db_filepath = db_directory / grid_database + # ---- Update config + file_configuration["database"]["grid"] = db_filepath + + # Create if file doesn't already exist + if not db_filepath.exists(): + + # Get projection + projection = file_configuration["geospatial"]["projection"] + + # Get grid settings + grid_settings = file_configuration["geospatial"]["griddify"] + + # Get the resolution + resolution = grid_settings["grid_resolution"] + # ---- Convert from nmi to m + resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + + # Get boundary coordinates + boundary = grid_settings["bounds"] + # ---- x + x = boundary["longitude"] + # ---- y + y = boundary["latitude"] + # ---- Create DataFrame + boundary_df = pd.DataFrame({ + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) + }) + + # Create GeoDataFrame + boundary_gdf = gpd.GeoDataFrame( + data = boundary_df, + geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), + crs = projection ) + + # Convert to UTM (decimal degrees to m) + # ---- Create UTM code + utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2) + # ---- Create number code + utm_num = int(utm_code) + # ---- UTM conversion + boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + + # Get step sizes for each grid cell + # ---- x + x_step = resolution_m["x_distance"] + # ---- y + y_step = resolution_m["y_distance"] + + # Prepare grid cell generation + # ---- Get new boundaries + xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds + # ---- Initialize empty list + grid_cells = [] + # ---- Initialize coordinate counter + y_ct = 0 + x_coord = []; y_coord = [] + # ---- Iterate through to generate cells + for y0 in np.arange(ymin, ymax, y_step): + y_ct += 1 + x_ct = 0 + for x0 in np.arange(xmin, xmax, x_step): + x_ct += 1 + # ---- Step forward + x_coord.append(x_ct) + y_coord.append(y_ct) + x1 = x0 - x_step + y1 = y0 + y_step + # ---- Append to list + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) + # ---- Add cordinates + cells_gdf.loc[:, "x"] = np.array(x_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) + + # Get coastline shapefile directory, if defined + if "coastline" in file_configuration["input_directories"]: + + # Get coastline settings + coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Create filepath + shp_filepath = ( + root_dir / coast_settings["directory"] + / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + ) + # ---- Validate existence + if not shp_filepath.exists(): + raise FileNotFoundError( + f"{shp_filepath} does not exist!" 
+ ) + + # Get original lat/lon geometry boundaries + xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + + # Read in file + full_coast = gpd.read_file(shp_filepath) + # ---- Convert to UTM + full_coast_utm = full_coast.to_crs(utm_code) + # ---- Remove empty + full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + + # Create bouning box with a buffer + boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) + # ---- Create an unbuffered copy + boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) + # ---- Convert to a GeoDataFrame + boundary_box_unbuffered_gdf = ( + gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + ) + # ---- Clip the coastline for saving + clipped_coast_original = ( + gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + ) + + # Clip the coastline shapefile + clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + + # Clip the grid cells + cells_gdf.loc[:, "geometry"] = ( + cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + ) + + # Calculate area per cell + cells_gdf.loc[:, "area"] = cells_gdf.area + + # Convert back to original projection and clip + clipped_cells_latlon = ( + gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) + .reset_index(drop=True) + ) + + # Initialize empty columns that can be added to later on + clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", + "abundance", "biomass"]] = 0.0 + + # Create output DataFrame + output_df = pd.DataFrame({ + "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Add the required columns + output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], + axis=1) + # ---- Initialize empty columns that can be added to later on + output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", + "biomass"]] = 0.0 + + # Write to the database file (for the grid) + # ---- Create engine + engine = sqla.create_engine(f"sqlite:///{db_filepath}") + # ---- Connect and create table + _ = output_df.to_sql("grid_df", engine, if_exists="replace", index=False) + + # Write to the database file (for the coastline shapefile) + # ---- Create output copy + coastline_out = pd.DataFrame({ + "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Concatenate + coastline_out = ( + pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + ) + # ---- Connect and create table + _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index f3cb7f5a..58c5c27c 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -35,6 +35,7 @@ weight_proportions ) +from .live_spatial_methods import initialize_grid from . import live_data_processing as eldp from . 
import live_data_loading as eldl @@ -73,6 +74,9 @@ def __init__( # Initialize the results attribute self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + # Initialize the extrapolation grid + initialize_grid(self.config) + # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) @@ -204,8 +208,8 @@ def process_biology_data(self): # Compute `sigma_bs` by sending it to the appropriate database table compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + biology_unprocessed["length_df"], + self.config) # Bin the length measurements of the biological data bin_length_data(biology_unprocessed, self.config["length_distribution"]) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index f680e908..f9dd36eb 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -380,7 +380,8 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = root_directory / "database" + # db_directory = root_directory / "database" + db_directory = Path(file_settings["database_directory"]) # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file @@ -625,7 +626,7 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(root_directory) / "database" + db_directory = Path(file_configuration["database_directory"]) # ---- Complete path to the database file db_file = db_directory / db_name From 63e79614622c5a13c87cd01ba934febcc3d0bd4f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:13:50 -0700 Subject: [PATCH 20/81] f-string fix for coastline db file creation --- echopop/live/live_spatial_methods.py | 2 +- echopop/mesh_generation.py | 132 +++-------- echopop/zarr_read_ingest_test.py | 339 +++++++++++++++++++++++++-- 3 files changed, 354 insertions(+), 119 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 29f5df4e..d8a46523 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -456,7 +456,7 @@ def initialize_grid(file_configuration = dict): # ---- Create filepath shp_filepath = ( root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + / coast_settings["coastline_name"] / f"{coast_settings['coastline_name']}.shp" ) # ---- Validate existence if not shp_filepath.exists(): diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index bb78e1ba..699eed4f 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -12,107 +12,8 @@ # Create the grid points grid_points = [(i, j, 0) for i in x for j in y] -def load_acoustic_data(file_configuration: dict) -> Tuple[pd.DataFrame]: - - # Get the acoustic file settings and root directory - # ---- File settings - file_settings = file_configuration["input_directories"]["acoustics"] - # ---- Root directory - root_directory = file_configuration["data_root_dir"] - - # Get and validate the acoustic data directory and files - acoustic_files = validate_data_directory(root_directory, file_settings) - - # Query `acoustics.db` to process only new files (or create the db file in the first place) - new_acoustic_files = 
query_acoustic_db_files(file_configuration, acoustic_files) - - # Read in the acoustic data files - # ! [REQUIRES DASK] ---- Read in the listed file - prc_nasc_df, acoustic_data_units = read_acoustic_zarr(new_acoustic_files) - # ---- Add the `acoustic_data_units` to the dictionary - file_configuration["acoustics"]["dataset_units"] = acoustic_data_units - - # Preprocess the acoustic dataset - prc_nasc_df_processed = preprocess_acoustic_data(prc_nasc_df, file_configuration) - - # Return output - return prc_nasc_df_processed - -def read_acoustic_zarr(acoustic_files: Path) -> tuple: - - # Iterate through each of the file ids and read in the data - for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Lower-case sex - if "sex" in sub_df.columns: - sub_df["sex"] = sub_df["sex"].str.lower() - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - - # Get contrasts used for filtering the dataset - # ---- Species - species_filter = file_configuration["species"]["number_code"] - # ---- Trawl partition information - trawl_filter = biology_analysis_settings["catch"]["partition"] - # ---- Apply the filter - filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty - } - - # Update the SQL database - for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - - # Combine the two datasets - merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) - } - # ---- Return output - if update_config: - if file_configuration["database"]["biology"] is None: - 
file_configuration["database"]["biology"] = db_file - return merged_output, file_configuration - else: - return merged_output +def initialize_grid(): data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") @@ -2016,6 +1917,12 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): boundary_dict = griddify_definitions["bounds"] +from geopy.distance import distance +import numpy as np +import pandas as pd +import geopandas as gpd +from echopop.spatial.projection import utm_string_generator + ## grid_settings["grid_resolution"]["x"] = 50 grid_settings["grid_resolution"]["y"] = 50 @@ -2034,6 +1941,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): crs = projection ) from echopop.spatial.projection import utm_string_generator +import shapely.geometry utm_string_generator(-117.0, 33.75) bound_gdf.total_bounds # Convert to UTM @@ -2125,7 +2033,30 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # plt.xlim(lon_min-3, lon_max+3) # plt.ylim(lat_min-3, lat_max+3) # plt.show() +test = SQL(db_filepath, "select", table_name="grid_df") +from shapely import wkt +import matplotlib.pyplot as plt + +test = output_df.copy() +test["geometry"] = test["geometry"].apply(wkt.loads) +test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs=projection) + +co = SQL(db_filepath, "select", table_name="coastline_df") +co["geometry"] = co["geometry"].apply(wkt.loads) +co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + +lims = test_gdf.total_bounds + +fig, ax = plt.subplots(figsize=(10, 10)) +test_gdf.plot(ax=ax, column="abundance", edgecolor="black", cmap="viridis", legend=False) +co_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +plt.xlim(lims[0]*1.005, lims[2]*1.01) +plt.ylim(lims[1]*0.98, lims[3]*1.005) +plt.show() + +test["geometry"].apply(wkt.loads) +clipped_cells_latlon["geometry"] len(bbox_latlon.exterior.coords) len(buffer_boundary.exterior.coords) @@ -2151,6 +2082,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' cells_latlon_clipped.to_crs(custom_crs).crs ######## +import sqlalchemy as sqla import matplotlib.colors as colors import matplotlib.cm as cm cells_transformed = cells_latlon.to_crs(utm_code) diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index e6d00cc9..0512b667 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -21,7 +21,7 @@ from echopop.live import live_data_loading as eldl from echopop.live.live_data_processing import query_dataset, get_unique_identifiers from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import integrate_nasc +from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey @@ -34,26 +34,200 @@ length_weight_dict = analysis_dict["biology"]["weight"] stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] -files = data_files +updated_survey_data = nasc_biology.copy() +gridding_column = file_configuration["gridding_column"] +unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) - - - # Map the table names and validate table creation - # ---- Get table names - tables = SQL(db_file, "map") - # ---- `files_read` - if 
"files_read" not in tables: - raise KeyError( - f"SQL database table `files_read` in `{db_file}` failed to initialize!" - ) - # ---- `files_processed` - if "files_processed" not in tables: - raise KeyError( - f"SQL database table `files_processed` in `{db_file}` failed to initialize!" +file_configuration = self.config +grid_settings["grid_resolution"]["x"] = 50 +grid_settings["grid_resolution"]["y"] = 50 +lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +self = realtime_survey +file_configuration = self.config + +def initialize_grid(): + + # Get root directory, if defined + if "data_root_dir" in file_configuration: + root_dir = Path(file_configuration["data_root_dir"]) + else: + root_dir = Path() + + # Get `grid` settings + grid_database = file_configuration["input_directories"]["grid"]["database_name"] + + # Create full filepath + db_filepath = root_dir / "database" / grid_database + + # Create if file doesn't already exist + if not db_filepath.exists(): + + # Get projection + projection = file_configuration["geospatial"]["projection"] + + # Get grid settings + grid_settings = file_configuration["geospatial"]["griddify"] + + # Get the resolution + resolution = grid_settings["grid_resolution"] + # ---- Convert from nmi to m + resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + + # Get boundary coordinates + boundary = grid_settings["bounds"] + # ---- x + x = boundary["longitude"] + # ---- y + y = boundary["latitude"] + # ---- Create DataFrame + boundary_df = pd.DataFrame({ + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) + }) + + # Create GeoDataFrame + boundary_gdf = gpd.GeoDataFrame( + data = boundary_df, + geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), + crs = projection ) - + + # Convert to UTM (decimal degrees to m) + # ---- Create UTM code + utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2) + # ---- Create number code + utm_num = int(utm_code) + # ---- Create string code + utm_str = f"epsg:{utm_num}" + # ---- UTM conversion + boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + + # Get step sizes for each grid cell + # ---- x + x_step = resolution_m["x_distance"] + # ---- y + y_step = resolution_m["y_distance"] + + # Prepare grid cell generation + # ---- Get new boundaries + xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds + # ---- Initialize empty list + grid_cells = [] + # ---- Initialize coordinate counter + y_ct = 0 + x_coord = []; y_coord = [] + # ---- Iterate through to generate cells + for y0 in np.arange(ymin, ymax, y_step): + y_ct += 1 + x_ct = 0 + for x0 in np.arange(xmin, xmax, x_step): + x_ct += 1 + # ---- Step forward + x_coord.append(x_ct) + y_coord.append(y_ct) + x1 = x0 - x_step + y1 = y0 + y_step + # ---- Append to list + grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + + # Convert to a GeoDataFrame + cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) + # ---- Add cordinates + cells_gdf.loc[:, "x"] = np.array(x_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) + + # Get coastline shapefile directory, if defined + if "coastline" in file_configuration["input_directories"]: + + # Get coastline settings + coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Create filepath + 
shp_filepath = ( + root_dir / coast_settings["directory"] + / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" + ) + # ---- Validate existence + if not shp_filepath.exists(): + raise FileNotFoundError( + f"{shp_filepath} does not exist!" + ) + + # Get original lat/lon geometry boundaries + xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + + # Read in file + full_coast = gpd.read_file(shp_filepath) + # ---- Convert to UTM + full_coast_utm = full_coast.to_crs(utm_code) + # ---- Remove empty + full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + + # Create bouning box with a buffer + boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) + # ---- Create an unbuffered copy + boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) + # ---- Convert to a GeoDataFrame + boundary_box_unbuffered_gdf = ( + gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + ) + # ---- Clip the coastline for saving + clipped_coast_original = ( + gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + ) + + # Clip the coastline shapefile + clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + + # Clip the grid cells + cells_gdf.loc[:, "geometry"] = ( + cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + ) + + # Calculate area per cell + cells_gdf.loc[:, "area"] = cells_gdf.area + + # Convert back to original projection and clip + clipped_cells_latlon = ( + gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) + .reset_index(drop=True) + ) + + # Initialize empty columns that can be added to later on + clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", + "abundance", "biomass"]] = 0.0 + + # Create output DataFrame + output_df = pd.DataFrame({ + "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Add the required columns + output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], + axis=1) + # ---- Initialize empty columns that can be added to later on + output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", + "biomass"]] = 0.0 + + # Write to the database file (for the grid) + # ---- Create engine + engine = sqla.create_engine(f"sqlite:///{db_filepath}") + # ---- Connect and create table + _ = output_df.to_sql("grid_df", engine, if_exists="replace") + + # Write to the database file (for the coastline shapefile) + # ---- Create output copy + coastline_out = pd.DataFrame({ + "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) + }) + # ---- Concatenate + coastline_out = ( + pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + ) + # ---- Connect and create table + _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") #################################################################################################### # TEST: YAML FILE CONFIGURATION @@ -245,9 +419,138 @@ def biology_pipeline(biology_dict: dict, data_table = "grid" grid_table = "reference" column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] -coordinates = ["x", "y"] + dataframe = nasc_biology_output +import sqlalchemy as sqla +grid_db_file = file_configuration["database"]["grid"] +survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" +data_table = "survey_data_df" +grid_table = "grid_df" +coordinates = ["x", "y"] +from echopop.live.sql_methods import SQL + +SQL(grid_db_file, "select", 
table_name=grid_table) +SQL(survey_db_file, "select", table_name=data_table) +SQL(data_table, "map") + +updated_survey_data = nasc_biology.copy() +# Get relevant table +previous_grid = query_dataset(grid_db_file, updated_survey_data, + table_name=grid_table, + data_columns=["x", "y", "area", "number_density_mean", + "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) + +# Get unique coordinates +update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) +update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + + + +number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + +SQL(grid_db_file, "select", table_name=grid_table) + + + +pulled_data = pd.concat([SQL(grid_db_file, "select", + table_name=grid_table, + condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) +previous_cell_data = pd.concat([SQL(survey_db_file, "select", + table_name=data_table, + condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) + +from echopop.live.live_data_processing import get_nasc_sql_data, get_sigma_bs_sql_data, get_average_strata_weights, summarize_strata +from echopop.live.sql_methods import sql_group_update +from typing import List +from shapely.geometry import box +SQL(grid_db_file, "select", table_name="grid_df") +# Compute means +number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() +previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() + +[SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] + +# Write to the database file (for the grid) +# ---- Create engine +engine = sqla.create_engine(f"sqlite:///{db_filepath}") + +def update_population_grid(grid_db_file: str, + data_table: str, + grid_table: str, + dataframe: pd.DataFrame, + column_pairs: Union[List[tuple[str, str]], tuple[str, str]], + coordinates: List[str]): + + # Convert `column_pairs` to a list, if needed + if not isinstance(column_pairs, list): + column_pairs = [column_pairs] + + dataframe[coordinates] + # Format the coordinate pairs + # ---- Convert coordinate values into a list of tuples + coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] + # ---- Get unique pairs + coords = list(set(coord_pairs)) + + # Format the SQL script command + # ---- Initialize + sql_script = [] + # ---- Iteratively update + for input_column, output_column in column_pairs: + sql_script.append( + f""" + BEGIN TRANSACTION; + + -- Calculate averages for input_column and update grid_table + WITH avgs AS ( + SELECT + {coordinates[0]}, + {coordinates[1]}, + AVG(d.{input_column}) as avg_value + FROM {data_table} d + GROUP BY d.{coordinates[0]}, d.{coordinates[1]} + ) + + -- Update the grid_table with both average and computed total + UPDATE {grid_table} + SET + mean_{input_column} = ( + SELECT avg_value + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ), + {output_column} = ( + SELECT avg_value * {grid_table}.area + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ) + WHERE 
EXISTS ( + SELECT 1 + FROM avgs + WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} + AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} + ); + + COMMIT; + """ + ) + + # Create the engine + engine = create_engine(f"sqlite:///{db_file}") + + # Create the SQL database connection and send the script + with engine.connect() as connection: + dbapi_conn = connection.connection + _ = dbapi_conn.executescript("\n".join(sql_script)) + + + def update_population_grid(db_file: str, data_table: str, grid_table: str, From ab6d9ffdaf084c3f2000c8b510565a5841894acb Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:32:04 -0700 Subject: [PATCH 21/81] Fix to stratum/spatial config key name --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 84316027..e42a86dd 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -545,7 +545,7 @@ def configure_spatial_settings(file_configuration: dict): file_configuration.update({"spatial_column": []}) # Add grid - file_configuration.update({"gridding_column": file_configuration["stratum_column"] + ["x", "y"]}) + file_configuration.update({"gridding_column": file_configuration["spatial_column"] + ["x", "y"]}) # Return the dictionary as an output return spatial_dict From 8dd470c7087d35991dc0d57d2d61bfebf673317b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:37:57 -0700 Subject: [PATCH 22/81] Fix to database directory initialization --- echopop/live/live_data_loading.py | 2 ++ echopop/live/sql_methods.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index e42a86dd..8ef37ce5 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -232,6 +232,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, raise TypeError( "Data loading argument `input_filenames` must be a list." 
) + # + root_directory = file_configuration["database_directory"] # Initialize the database file initialize_database(root_directory, file_settings) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index f9dd36eb..67cf424c 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -380,8 +380,8 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create filepath to the SQL database # ---- Create Path to SQL database file - # db_directory = root_directory / "database" - db_directory = Path(file_settings["database_directory"]) + db_directory = Path(root_directory) + # db_directory = Path(file_settings["database_directory"]) # ---- Create the directory if it does not already exist db_directory.mkdir(parents=True, exist_ok=True) # ---- Complete path to the database file From b6fbae513983bb306068c4c78d9e34c7fccc3639 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:41:48 -0700 Subject: [PATCH 23/81] Additional db directorypath changes/fixes --- echopop/live/sql_methods.py | 3 ++- echopop/zarr_read_ingest_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 67cf424c..7ae3824f 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -626,7 +626,8 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file - db_directory = Path(file_configuration["database_directory"]) + # db_directory = Path(file_configuration["database_directory"]) + db_directory = Path(root_directory) # ---- Complete path to the database file db_file = db_directory / db_name diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 0512b667..2df92682 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -442,6 +442,15 @@ def biology_pipeline(biology_dict: dict, "biomass_density_mean", "abundance", "biomass"], unique_columns=["x", "y"]) +# Index +previous_grid.set_index(["x", "y"], inplace=True) +previous_grid["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +previous_grid["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() + +# Convert area from m^2 to nmi^2 +previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] +previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] + # Get unique coordinates update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() From 6c6214f870c00d133588d1c819cd432c27d8641f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 12:45:41 -0700 Subject: [PATCH 24/81] Fix `data_root_dir` missing workaround --- echopop/live/live_data_loading.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 8ef37ce5..25c89064 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -54,6 +54,10 @@ def live_configuration(live_init_config_path: Union[str, Path], f"file." 
) + # Amend root directory, if needed + if "data_root_dir" not in file_config: + file_config["data_root_dir"] = "" + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} @@ -234,7 +238,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, ) # root_directory = file_configuration["database_directory"] - + # Initialize the database file initialize_database(root_directory, file_settings) From d1bdc2cd74037dfa1812062ac87b3ddf2605bc75 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 13:21:27 -0700 Subject: [PATCH 25/81] db pathing issues fixed --- echopop/live/live_acoustics.py | 5 ++++- echopop/live/live_data_loading.py | 4 ---- echopop/live/live_spatial_methods.py | 8 ++++++-- echopop/live/live_survey.py | 14 +++++++++----- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 6c1ebf08..24f96681 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -261,8 +261,11 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values + # Get root database directory + root_database = file_configuration["database_directory"] + # Update the successfully processed files - query_processed_files(file_configuration["data_root_dir"], + query_processed_files(root_database, file_configuration["input_directories"]["acoustics"], meta_dict["provenance"]["acoustic_files"], processed=True) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 25c89064..f6365689 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -54,10 +54,6 @@ def live_configuration(live_init_config_path: Union[str, Path], f"file." ) - # Amend root directory, if needed - if "data_root_dir" not in file_config: - file_config["data_root_dir"] = "" - # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index d8a46523..c86f20b9 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -453,10 +453,14 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] + # ---- Get root folder directory + coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] # ---- Create filepath shp_filepath = ( - root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings['coastline_name']}.shp" + # root_dir / coast_settings["directory"] + # / coast_settings["coastline_name"] + coast_root + / f"{coast_settings['coastline_name']}.shp" ) # ---- Validate existence if not shp_filepath.exists(): diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 58c5c27c..870b57da 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -39,6 +39,7 @@ from . import live_data_processing as eldp from . 
import live_data_loading as eldl + class LiveSurvey: """ A real-time processing version of the `echopop` base `Survey` class that ingests biological, @@ -60,7 +61,7 @@ def __init__( # initialize the Survey class object self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - # ---- Initialize config key for database files + # # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} ) @@ -198,6 +199,9 @@ def process_biology_data(self): # ----- Unprocessed biology_unprocessed = self.input["biology"] + # Get database root directory + root_directory = self.config["database_directory"] + # Check if data are present unprocess_data_dfs = ( [True if isinstance(df, pd.DataFrame) and not df.empty else False @@ -260,10 +264,10 @@ def process_biology_data(self): }) # Update the database - query_processed_files(self.config["data_root_dir"], - self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files"], - processed=True) + query_processed_files(root_directory, + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files"], + processed=True) def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): From 3e252a068330e2892439cbc12fee8fea13faca2c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 13:26:03 -0700 Subject: [PATCH 26/81] `data_root_dir` check for `read_biology_files` --- echopop/live/live_data_loading.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f6365689..0ad82db5 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -109,7 +109,10 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # ---- Initialize the dictionary that will define this key in the `input` attribute biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} # # ---- Create filepath object - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + if "data_root_dir" in file_configuration: + directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + else: + directory_path = Path(file_settings["directory"]) # Add SQL file to dict # file_configuration["database"]["biology"] = ( From a1cec0198c5758cdfbe59558ee98a8d5dd62a169 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 12 Aug 2024 19:53:29 -0700 Subject: [PATCH 27/81] Gridding methods --- echopop/live/live_data_processing.py | 79 +++---------- echopop/live/live_spatial_methods.py | 83 +++++++++++++- echopop/live/live_visualizer.py | 0 echopop/live/sql_methods.py | 60 ++++++++++ echopop/mesh_generation.py | 161 +++++++++++++++++++++++++-- echopop/test_workflow.py | 1 - echopop/zarr_read_ingest_test.py | 62 +++++++++-- 7 files changed, 355 insertions(+), 91 deletions(-) create mode 100644 echopop/live/live_visualizer.py diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index c46317a0..a235bf58 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -2,8 +2,9 @@ import re from functools import reduce -from .sql_methods import SQL, sql_group_update +from .sql_methods import SQL, sql_group_update, query_dataset, get_unique_identifiers from .live_biology import summarize_strata +from .live_spatial_methods import update_population_grid 
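# A minimal usage sketch for the two helpers now imported from `sql_methods`:
# `get_unique_identifiers` collapses one or more DataFrames down to their unique key
# combinations, and `query_dataset` SELECTs only the matching rows from an existing
# table (returning None when the table is absent). The database path and the toy
# DataFrames below are hypothetical.
#
#   import pandas as pd
#   from echopop.live.sql_methods import get_unique_identifiers, query_dataset
#
#   toy_dict = {
#       "specimen_df": pd.DataFrame({"stratum": [1, 1, 2], "length": [20.0, 22.0, 25.0]}),
#       "length_df": pd.DataFrame({"stratum": [2, 3], "length": [24.0, 30.0]}),
#   }
#   keys_df = get_unique_identifiers(toy_dict, unique_columns=["stratum"])  # strata 1, 2, 3
#   subset_df = query_dataset("biology.db", toy_dict, table_name="specimen_df",
#                             data_columns=["stratum", "length"],
#                             unique_columns=["stratum"])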
from pathlib import Path from typing import Union, Tuple, Optional, List @@ -16,68 +17,6 @@ LIVE_INPUT_FILE_CONFIG_MAP ) -def get_unique_identifiers(data_dict: dict, - unique_columns: List[str]) -> pd.DataFrame: - - # Gather all dataframes from a dictionary into a list - if isinstance(data_dict, dict): - df_list = [df for _, df in data_dict.items()] - else: - df_list = [data_dict] - - # Get unique values of each contrast column across the biological datasets - # dfs = [pd.DataFrame({col: df[col].unique().tolist()}) for col in unique_columns - # for df in df_list if isinstance(df, pd.DataFrame) and not df.empty and col in df.columns] - combined_df = pd.concat( - [df[unique_columns] for df in df_list if all(col in df.columns for col in unique_columns)], - ignore_index=True - ).drop_duplicates() - - # Reduce into a single DataFrame - return combined_df - # if len(unique_columns) > 1: - # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) - # else: - # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) - -def query_dataset(db_file: str, - data_dict: dict, - table_name: str, - data_columns: List[str], - unique_columns: List[str], - constraint: str = None): - - # Validate that the desired table exists - if SQL(db_file, "validate", table_name=table_name): - # ---- Inspect the SQL table - inspected_table = SQL(db_file, "inspect", table_name=table_name) - # ---- Create a list of intersecting column names - unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns))) - # ---- Create list of valid columns - valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) - # ---- Get unique identifiers - unique_keys_df = get_unique_identifiers(data_dict, unique_keys) - # ---- Create conditional string - conditional_str = " | ".join( - [" & ".join([f"{col} = {val}" for col, val in row.items()]) - for _, row in unique_keys_df.iterrows()] - ) - # conditional_str = ( - # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - # for col in unique_keys_df.columns]) - # ) - # ---- Append the additional constraint statement if present - if constraint is not None: - conditional_str = f"({conditional_str})" + f" & {constraint}" - # ---- SELECT the dataset using the conidtional statement - data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, - condition=conditional_str).filter(data_columns) - else: - data_sql = None - - # Return the table DataFrame - return data_sql - def get_average_strata_weights(db_file: str, data_dict: dict, unique_columns: list): @@ -164,6 +103,10 @@ def acoustic_pipeline(acoustic_dict: dict, # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) + # Update grid + update_population_grid(file_configuration, coordinates=["x", "y"], + dataset=nasc_biology) + def get_nasc_sql_data(db_file: str, data_dict: dict, unique_columns: List[str]): @@ -246,7 +189,7 @@ def biology_pipeline(biology_dict: dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, + sigma_bs_df = get_sigma_bs_sql_data(biology_db, biology_dict, unique_columns=unique_columns) @@ -262,8 +205,8 @@ def biology_pipeline(biology_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) + biology_dict, + 
unique_columns=unique_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates @@ -282,3 +225,7 @@ def biology_pipeline(biology_dict: dict, # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) + + # Update population grid + update_population_grid(file_configuration, coordinates=["stratum"], + dataset=nasc_biology) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index c86f20b9..510e26a6 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -7,7 +7,8 @@ from shapely.geometry import box import sqlalchemy as sqla from pathlib import Path -from typing import Union +from typing import Union, List +from .sql_methods import sql_group_update, query_dataset def create_inpfc_strata(spatial_config: dict): @@ -181,8 +182,8 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Generate the cells grid_cells = [] # ---- Iterate through - for y0 in np.arange(ymin, ymax+y_step, y_step): - for x0 in np.arange(xmin, xmax+x_step, x_step): + for y0 in np.arange(ymin, ymax, y_step): + for x0 in np.arange(xmin, xmax, x_step): x1 = x0-x_step y1 = y0+y_step grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) @@ -210,9 +211,9 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): dataset_gdf["stratum_x"] = pd.cut( dataset_gdf["x"], np.arange(xmin, xmax+x_step, x_step), - right = True, - labels = range(len(np.arange(xmin, xmax+x_step, x_step)) - 1), - ).astype(int) + 1 + right = False, + labels = np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))), + ).astype(int) # Bin the latitude data dataset_gdf["stratum_y"] = pd.cut( @@ -501,6 +502,8 @@ def initialize_grid(file_configuration = dict): # Calculate area per cell cells_gdf.loc[:, "area"] = cells_gdf.area + # ---- Convert back to nmi^2 from m^2 + cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852 ** 2 # Convert back to original projection and clip clipped_cells_latlon = ( @@ -540,3 +543,71 @@ def initialize_grid(file_configuration = dict): ) # ---- Connect and create table _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) + +def update_population_grid(file_configuration: dict, + coordinates: Union[List[str], str], + dataset: Union[dict, pd.DataFrame]): + + # Extract input directory settings + file_settings = file_configuration["input_directories"] + + # Get filepath for grid + grid_db = list( + Path(file_configuration["database_directory"]) + .glob(pattern=f"{file_settings["grid"]["database_name"]}") + )[0] + + # Get filepath for acoustics + survey_db = list( + Path(file_configuration["database_directory"]) + .glob(pattern=f"{file_settings["acoustics"]["database_name"]}") + )[0] + + # Define the SQL tables that will be parsed and queries + data_table = "survey_data_df" + grid_table = "grid_df" + + # Get indexed survey data + indexed_data = query_dataset(survey_db, + dataset, + table_name=data_table, + data_columns=coordinates + ["x", "y", "number_density", + "biomass_density"], + unique_columns=coordinates) + + # Get indexed grid data + indexed_grid = query_dataset(grid_db, + indexed_data, + table_name=grid_table, + data_columns= ["x", "y", "area", "number_density_mean", + "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) + + # Set DataFrame index + indexed_grid.set_index(["x", "y"], inplace=True) + + # Update the areal density esitmates + # ---- Number (animals/nmi^2) + 
indexed_grid["number_density_mean"] = indexed_data.groupby(["x", "y"])["number_density"].mean() + # ---- Bioamss (kg/nmi^2) + indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])["biomass_density"].mean() + + # Compute the abundance and biomass per grid cell + # ---- Abundance (# animals) + indexed_grid["abundance"] = indexed_grid["number_density_mean"] * indexed_grid["area"] + # ---- kg + indexed_grid["biomass"] = indexed_grid["biomass_density_mean"] * indexed_grid["area"] + + # Update grid table + # ---- Reset index + output_df = indexed_grid.reset_index() + # ---- Grouped update + sql_group_update(grid_db, dataframe=output_df, table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", + "biomass"], + unique_columns=["x", "y"]) + + + + + \ No newline at end of file diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py new file mode 100644 index 00000000..e69de29b diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 7ae3824f..eb009780 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -566,6 +566,29 @@ def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List # Return a list of the output return list(key_columns) +def get_unique_identifiers(data_dict: dict, + unique_columns: List[str]) -> pd.DataFrame: + + # Gather all dataframes from a dictionary into a list + if isinstance(data_dict, dict): + df_list = [df for _, df in data_dict.items()] + else: + df_list = [data_dict] + + # Get unique values of each contrast column across the biological datasets + combined_df = pd.concat( + [df[unique_columns] for df in df_list if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns)], + ignore_index=True + ).drop_duplicates() + + # Reduce into a single DataFrame + return combined_df + # if len(unique_columns) > 1: + # return reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + # else: + # return reduce(lambda left, right: pd.merge(left, right, how="outer"), dfs) + + def parse_condition(condition: str): # Replace logical operators with SQL equivalents condition = condition.replace('&', ' AND ').replace('|', ' OR ') @@ -699,6 +722,43 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str f"Attempted reset of [{str(db_file)}] failed." 
) +def query_dataset(db_file: str, + data_dict: dict, + table_name: str, + data_columns: List[str], + unique_columns: List[str], + constraint: Optional[str] = None): + + # Validate that the desired table exists + if SQL(db_file, "validate", table_name=table_name): + # ---- Inspect the SQL table + inspected_table = SQL(db_file, "inspect", table_name=table_name) + # ---- Create a list of intersecting column names + unique_keys = list(set(inspected_table.keys()).intersection(set(unique_columns))) + # ---- Create list of valid columns + valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) + # ---- Get unique identifiers + unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Create conditional string + conditional_str = " | ".join( + [" & ".join([f"{col} = {val}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows()] + ) + # conditional_str = ( + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) + # ) + # ---- Append the additional constraint statement if present + if constraint is not None: + conditional_str = f"({conditional_str})" + f" & {constraint}" + # ---- SELECT the dataset using the conidtional statement + data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, + condition=conditional_str).filter(data_columns) + else: + data_sql = None + + # Return the table DataFrame + return data_sql def sql_update_strata_summary(source_db: str, target_db: str, source_table: str, diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 699eed4f..7752fe63 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -2033,28 +2033,167 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): # plt.xlim(lon_min-3, lon_max+3) # plt.ylim(lat_min-3, lat_max+3) # plt.show() -test = SQL(db_filepath, "select", table_name="grid_df") +from echopop.live.sql_methods import SQL from shapely import wkt import matplotlib.pyplot as plt +import geopandas as gpd +import matplotlib.colors as colors +import matplotlib.cm as cm +import numpy as np +from matplotlib.colors import ListedColormap +import matplotlib.dates as mdates +from datetime import datetime +db_filepath = realtime_survey.config["database"]["grid"] +survey_db = realtime_survey.config["database"]["acoustics"] +grid_df = SQL(db_filepath, "select", table_name="grid_df") +# grid_df[grid_df.abundance > 0] +grid_df[grid_df.abundance > 1e10] +# grid_df[grid_df.abundance > 0] +coast_df = SQL(db_filepath, "select", table_name="coastline_df") +survey_df = SQL(survey_db, "select", table_name="survey_data_df") + +# def parse_datetime(date_str): +# # List of possible formats +# formats = [ +# '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds +# '%Y-%m-%d %H:%M:%S', # Without fractional seconds +# '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds +# '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds +# ] + +# for fmt in formats: +# try: +# return pd.to_datetime(date_str, format=fmt) +# except (ValueError, TypeError): +# continue # Try the next format + +# return pd.NaT # Return NaT if no formats match -test = output_df.copy() -test["geometry"] = test["geometry"].apply(wkt.loads) -test_gdf = gpd.GeoDataFrame(test, geometry="geometry", crs=projection) +# survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) -co = SQL(db_filepath, "select", table_name="coastline_df") -co["geometry"] = co["geometry"].apply(wkt.loads) -co_gdf = 
gpd.GeoDataFrame(co, geometry="geometry", crs=projection) +# pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") -lims = test_gdf.total_bounds +# fig, ax = plt.subplots(figsize=(5, 8)) +# ax.scatter(survey_df.ping_time, survey_df.nasc) +# plt.ylabel("NASC") +# # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) +# plt.show() -fig, ax = plt.subplots(figsize=(10, 10)) -test_gdf.plot(ax=ax, column="abundance", edgecolor="black", cmap="viridis", legend=False) -co_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + +# times = np.arange(np.datetime64('2001-01-02'), +# np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) +# y = np.random.randn(len(times)) +# survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() +# survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() + +# fig, ax = plt.subplots() +# ax.plot(times, y) +# survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] +# # a = self.input["acoustics"]["prc_nasc_df"] +# # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] + +grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) +coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) + +projection = realtime_survey.config["geospatial"]["projection"] + +grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) +grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] +coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) + +lims = grid_gdf.total_bounds +# nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] +# dataset_gdf.stratum_x.max() +# # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) + +# # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) +# pd.cut( +# nu["x"], +# np.arange(xmin, xmax, x_step), +# right = False, +# labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), +# ).astype(int) - 1 +# grid_gdf["x"] = grid_gdf["x"] - 1 + +# fig, ax = plt.subplots(figsize=(5, 8)) +# grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) +# plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') +# plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") +# # Calculate centroids and plot text +# for idx, row in grid_gdf.iterrows(): +# centroid = row.geometry.centroid +# var = f"{row.x}-{row.y}" +# ax.annotate(var, xy=(centroid.x, centroid.y), +# xytext=(0,0), fontsize=8, +# textcoords="offset points", +# ha='center', va='center', color='black') +# plt.tight_layout() +# plt.margins(0, 0) +# coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# plt.xlim(lims[0]*1.005, lims[2]*1.01) +# plt.ylim(lims[1]*0.98, lims[3]*1.005) +# plt.show() + + +variable = "abundance" +VARIABLE_MAP = { + "number_density_mean": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$" + }, + "biomass_density_mean": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$" + }, + "biomass": { + "name": "Biomass", + "units": "kg" + }, + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$" + } +} + +viridis = plt.colormaps.get_cmap('viridis').resampled(1024) +newcolors = viridis(np.linspace(0, 1, 1024))[::-1] +white = np.array([1, 1, 1, 1]) +newcolors[0, :] = white +custom_cmap = ListedColormap(newcolors) +# Check the minimum and maximum values for normalization + + +fig, ax = plt.subplots(figsize=(5, 8)) +grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) 
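+# NOTE: `norm` is referenced by the very next plot call but is only assigned a few lines
+# further down in this scratch script; a minimal normalization (mirroring the later
+# assignments) is hoisted here so that the call below has a defined value.
+vmax = grid_gdf[variable].max()
+norm = colors.Normalize(vmin=0, vmax=vmax, clip=False)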
+grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, legend=False, norm=norm) +plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") +vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() +vmax = grid_gdf[variable].max() +norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) +# norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), vmax=grid_gdf[variable].max()) +# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, orientation="horizontal", shrink=0.5) +cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, orientation="horizontal", shrink=0.5) +cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", + fontsize=12, labelpad=10, loc='center') +cbar.ax.xaxis.set_label_position('top') +cbar.ax.xaxis.set_ticks_position('top') +plt.tight_layout() +plt.margins(0,0) +# grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") +coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") plt.xlim(lims[0]*1.005, lims[2]*1.01) plt.ylim(lims[1]*0.98, lims[3]*1.005) +plt.xlabel(u'Longitude (\u00B0E)') +plt.ylabel(u'Latitude (\u00B0N)') plt.show() +co = SQL(db_filepath, "select", table_name="coastline_df") +co["geometry"] = co["geometry"].apply(wkt.loads) +co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + + + test["geometry"].apply(wkt.loads) clipped_cells_latlon["geometry"] len(bbox_latlon.exterior.coords) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index e52c6739..7f2006c5 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,6 +1,5 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL - #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 2df92682..101bc81a 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -24,6 +24,27 @@ from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency from echopop.live.live_biology import preprocess_biology_data from echopop.survey import Survey +import geopandas as gpd +import pandas as pd +import numpy as np +import shapely.geometry +from shapely.geometry import box +from echopop.spatial.projection import utm_string_generator +from geopy.distance import distance +from echopop.live.sql_methods import SQL +from shapely import wkt +import matplotlib.pyplot as plt +import geopandas as gpd +import matplotlib.colors as colors +import matplotlib.cm as cm +import numpy as np +from matplotlib.colors import ListedColormap +self = realtime_survey +spatial_config = self.config["geospatial"] +dataset = self.input["acoustics"]["nasc_df"] + + + survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") survey_2019.transect_analysis() @@ -434,6 +455,8 @@ def biology_pipeline(biology_dict: dict, SQL(survey_db_file, "select", table_name=data_table) SQL(data_table, "map") +gridding_column = self.config["gridding_column"] + updated_survey_data = nasc_biology.copy() # Get relevant table previous_grid = query_dataset(grid_db_file, updated_survey_data, @@ -441,23 +464,39 @@ def 
biology_pipeline(biology_dict: dict, data_columns=["x", "y", "area", "number_density_mean", "biomass_density_mean", "abundance", "biomass"], unique_columns=["x", "y"]) +previous_data = query_dataset(survey_db_file, updated_survey_data, + table_name=data_table, + data_columns=["x", "y", "number_density", "biomass_density"], + unique_columns=["x", "y"]) +# Get unique coordinates +update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) + # Index previous_grid.set_index(["x", "y"], inplace=True) -previous_grid["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() -previous_grid["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"].mean() +previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() # Convert area from m^2 to nmi^2 previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] +previous_grid = previous_grid.reset_index() -# Get unique coordinates -update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) -update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +sql_group_update(grid_db_file, dataframe=previous_grid, + table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"]) +murr = SQL(grid_db_file, "select", table_name=grid_table) +murr[murr.abundance > 0] +update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() +am = SQL(grid_db_file, "select", table_name="grid_df") +am[am.abundance > 0] +bm = SQL(grid_db_file, "select", table_name="grid_df") +bm[bm.abundance > 0] number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() @@ -1656,6 +1695,10 @@ def __init__( TS_SLOPE = 20.0 TS_INTERCEPT = -68.0 +acoustic_db = realtime_survey.config["database"]["acoustics"] +SQL(acoustic_db, "select", table_name="files_processed") +biology_db = realtime_survey.config["database"]["biology"] +SQL(biology_db, "select", table_name="files_processedk") #### # CONCATENATE FILE SOURCES specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() @@ -1666,6 +1709,12 @@ def __init__( comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() +from echopop.live.sql_methods import SQL + +# Assuming that you have a LiveSurvey object defined +# ---- Get the database file name (and path) +biology_db = livesurvey_object.config["database"]["biology"] +# ---- # CONVERT TO TS comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT # TO SIGMA_BS @@ -1673,7 +1722,6 @@ def __init__( # WEIGHTED MEAN SIGMA_BS sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) -### # INTEGRATE NASC path2file = 
"C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" From dd87bc909216736f1a422e4c8a94c3776f150d12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Tue, 13 Aug 2024 10:16:51 -0700 Subject: [PATCH 28/81] Grid fix --- echopop/live/live_spatial_methods.py | 4 +- echopop/test_workflow.py | 94 +++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 510e26a6..33f534dd 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -554,13 +554,13 @@ def update_population_grid(file_configuration: dict, # Get filepath for grid grid_db = list( Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings["grid"]["database_name"]}") + .glob(pattern=f"{file_settings['grid']['database_name']}") )[0] # Get filepath for acoustics survey_db = list( Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings["acoustics"]["database_name"]}") + .glob(pattern=f"{file_settings['acoustics']['database_name']}") )[0] # Define the SQL tables that will be parsed and queries diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 7f2006c5..e95f7336 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -65,4 +65,96 @@ # NOTE: Quantized length-binned weights (summed) SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") \ No newline at end of file +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") + +dat = realtime_survey.input["acoustics"]["prc_nasc_df"].copy() +dat = dat[dat.latitude > 40] +dat = dat[dat.depth > 20] + +import matplotlib.pyplot as plt +import seaborn as sns +from geopy.distance import geodesic +import pandas as pd +from datetime import datetime +import matplotlib.dates as mdates +def calculate_distances(df): + distances = [0] # Start with 0 for the first point + for i in range(1, len(df)): + point1 = (df.iloc[i - 1]['latitude'], df.iloc[i - 1]['longitude']) + point2 = (df.iloc[i]['latitude'], df.iloc[i]['longitude']) + distances.append(geodesic(point1, point2).meters) + return distances + +def parse_datetime(date_str): + # List of possible formats + formats = [ + '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds + '%Y-%m-%d %H:%M:%S', # Without fractional seconds + '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds + '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds + ] + + for fmt in formats: + try: + return pd.to_datetime(date_str, format=fmt) + except (ValueError, TypeError): + continue # Try the next format + + return pd.NaT # Return NaT if no formats match + +dat["ping_time"] = dat["ping_time"].apply(parse_datetime) + +pivot_table = dat.pivot_table(index=["depth"], columns=["ping_time"], values=["NASC"], aggfunc="mean") +# Get the unique distance and depth values for plotting +plt.figure(figsize=(10, 8)) +ax = sns.heatmap(pivot_table, cmap="viridis", cbar_kws={'label': 'NASC'}) +plt.gca().xaxis.set_major_locator(mdates.MinuteLocator(interval=30)) # Major ticks every 30 minutes +plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) # Format as hour:minute +plt.gcf().autofmt_xdate() +ax.set_xticks(ax.get_xticks()[::max(len(ax.get_xticks()) // 10, 1)]) # Show fewer ticks if necessary +plt.xlabel('Ping time') 
+plt.ylabel('Depth') +# plt.gca().invert_yaxis() # To have depth increasing downwards like in a typical depth plot +plt.show() + + +dat.groupby(["ping_time"]).size() +unique_pairs = dat.drop_duplicates(subset=['latitude', 'longitude']).sort_values("ping_time") + +unique_pairs["d"] = calculate_distances(dat) +df['cumulative_distance'] = df['distance'].cumsum() + +unique_distances = dat.groupby('source')[['latitude', 'longitude']].unique().reset_index() +unique_distances = unique_distances.explode('distance') + + +# Create a pivot table to reshape the dataframe suitable for a heatmap +dat['source_id'] = dat['source'].astype('category').cat.codes +pivot_table = dat.pivot(index=["depth"], columns=["distance"], values=["NASC"]) +dat.groupby('source')['distance'].cumsum() +plt.plot(index="depth", columns="distance", values="NASC") +plt.show() + +data = { + 'distance': [1, 1, 2, 2, 1, 1, 3, 3], + 'depth': [1, 2, 1, 2, 1, 2, 1, 2], + 'source': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'] +} +dat = pd.DataFrame(data) +dat = dat.sort_values(by=['source', 'distance']) +unique_distances = dat.groupby('source')['distance'].unique().reset_index() +unique_distances = unique_distances.explode('distance') + +unique_distances['distance'] = pd.to_numeric(unique_distances['distance'], errors='coerce') +unique_distances['distance_diff'] = unique_distances.groupby('source')['distance'].diff().fillna(0) +unique_distances['cumsum_diff'] = unique_distances.groupby('source')['distance_diff'].cumsum() +unique_distances['Cumsum_dist'] = unique_distances['cumsum_diff'].cumsum() +unique_distances['Cumsum_dist'] = pd.to_numeric(unique_distances['Cumsum_dist'], errors='coerce') +dat = dat.merge(unique_distances[['source', 'distance', 'Cumsum_dist']], on=['source', 'distance'], how='left') + + +# Calculate cumulative sum of distances for each source +dat['Cumsum_dist'] = dat.groupby('source')['distance'].transform(lambda x: x.cumsum()) +dat['Cumsum_dist_within_source'] = dat.groupby('source')['distance'].cumsum() + +dat['Cumsum_dist'] = dat.groupby('source')['Cumsum_dist_within_source'].transform(lambda x: x + x.shift(1).fillna(0).cumsum()) From 8693641f1bc6dfd5069cc5db9fa853680308138c Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 10:32:53 -0700 Subject: [PATCH 29/81] Add `xarray` kwargs options --- echopop/live/live_data_loading.py | 10 ++++++---- echopop/live/live_survey.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 0ad82db5..e8b29ff9 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -57,7 +57,8 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: List[Path]) -> tuple: +def read_acoustic_files(acoustic_files: List[Path], + xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. 
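    # NOTE: `xarray_kwargs` is forwarded (via `read_acoustic_zarr`) to `xr.open_dataset`
    # for each zarr file, e.g. storage options/credentials when the stores are remote.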
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` @@ -66,7 +67,8 @@ def read_acoustic_files(acoustic_files: List[Path]) -> tuple: # Read all of the zarr files results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(Path(file), acoustics_config_map) + read_acoustic_zarr(Path(file), acoustics_config_map, + xarray_kwargs=xarray_kwargs) for file in acoustic_files )] @@ -154,7 +156,7 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # Return the output return biology_output -def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: +def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) -> tuple: # Format the file reading configuration # ---- Concatenate into a full configuration map @@ -162,7 +164,7 @@ def read_acoustic_zarr(file: Path, config_map: dict) -> tuple: **config_map["xarray_variables"]} # Determine the file loading method for the `acoustic_files` - zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto") + zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto", **xarray_kwargs) # Pre-process the Dataset, convert it to a DataFrame, and validate the structure # ---- Convert to a DataFrame diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 870b57da..9da07806 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -128,12 +128,14 @@ def __str__(self): return self.__repr__() def load_acoustic_data(self, - input_filenames: Optional[list] = None, + xarray_kwargs: dict = {}, + input_filenames: Optional[list] = None, verbose: bool = True): # Validate the data directory and format the filepaths - acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames) + acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", + input_filenames=input_filenames, + xarray_kwargs=xarray_kwargs) # Read in the acoustic data files if acoustic_files: From faed21a809bb9a23620cabf129ba6f7f5abc4b9b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 10:38:19 -0700 Subject: [PATCH 30/81] `pandas` kwargs storage options --- echopop/live/live_data_loading.py | 10 ++++++---- echopop/live/live_survey.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index e8b29ff9..30e14904 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -96,7 +96,8 @@ def filter_filenames(directory_path: Path, filename_id: str, # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[Path], file_configuration: dict): +def read_biology_files(biology_files: List[Path], file_configuration: dict, + pandas_kwargs: dict = {}): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] @@ -137,7 +138,8 @@ def read_biology_files(biology_files: List[Path], file_configuration: dict): # ---- Read in validated biology data dataframe_list = [read_biology_csv(Path(file), file_settings["file_name_formats"][dataset], - biology_config_map[dataset]) + biology_config_map[dataset], + pandas_kwargs) for file in dataset_files] # ---- Concatenate the dataset dataframe_combined = pd.concat(dataframe_list, ignore_index=True) @@ -265,10 +267,10 @@ def 
compile_filename_format(file_name_format: str): # Compile the regex pattern and return the output return re.compile(regex_pattern) -def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict): +def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys())) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), **pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 9da07806..fa738967 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -165,6 +165,7 @@ def load_acoustic_data(self, self.input["acoustics"]["prc_nasc_df"] = None def load_biology_data(self, + pandas_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True): @@ -182,7 +183,8 @@ def load_biology_data(self, ) # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config) + initial_biology_output = eldl.read_biology_files(biology_files, self.config, + pandas_kwargs=pandas_kwargs) # Preprocess the biology dataset self.input["biology"], self.input["biology_processed"] = ( From af9385170ca0ac5f0ae57a11e74ac626e83a63ca Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:08:40 -0700 Subject: [PATCH 31/81] `xarray_kwargs` patch --- echopop/live/live_survey.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index fa738967..3d5dd446 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -128,20 +128,20 @@ def __str__(self): return self.__repr__() def load_acoustic_data(self, - xarray_kwargs: dict = {}, + xarray_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True): # Validate the data directory and format the filepaths acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames, - xarray_kwargs=xarray_kwargs) + input_filenames=input_filenames) # Read in the acoustic data files if acoustic_files: # ! [REQUIRES DASK] ---- Read in the listed file # ---- Read in the acoustic data files - prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files) + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files, + xarray_kwargs=xarray_kwargs) # ---- Add the `acoustic_data_units` to the dictionary self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset From 0b13fa74d70cb88999165897c589ba7eb22fd9e8 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:17:45 -0700 Subject: [PATCH 32/81] Disable file/directory existence checker --- echopop/live/live_data_loading.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 30e14904..caa55244 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -217,19 +217,19 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate filepath, columns, datatypes # ---- Error evaluation (if applicable) - if not directory_path.exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." 
- ) + # if not directory_path.exists(): + # raise FileNotFoundError( + # f"The acoustic data directory [{directory_path}] does not exist." + # ) # Validate that files even exist # ---- List available *.zarr files data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - ) + # if not data_files: + # raise FileNotFoundError( + # f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" + # ) # Check and format specific input filenames if isinstance(input_filenames, list): From 9e9ae077202988c4515ddbb4746262ebf96e3b94 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:26:38 -0700 Subject: [PATCH 33/81] Remove `Path` typing for acoustic zarr input --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index caa55244..afd9be00 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -67,7 +67,7 @@ def read_acoustic_files(acoustic_files: List[Path], # Read all of the zarr files results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(Path(file), acoustics_config_map, + read_acoustic_zarr(file, acoustics_config_map, xarray_kwargs=xarray_kwargs) for file in acoustic_files )] From a46ccacb95beb69d74e28df3f5e2c7eb7558577b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 11:52:57 -0700 Subject: [PATCH 34/81] Attempts pathing fixes --- echopop/live/live_data_loading.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index afd9be00..96485229 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -207,13 +207,20 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Get the acoustic file settings and root directory # ---- Root directory if "data_root_dir" in file_configuration.keys(): - root_directory = Path(file_configuration["data_root_dir"]) + # root_directory = Path(file_configuration["data_root_dir"]) + root_directory = file_configuration["data_root_dir"] else: - root_directory = Path() + # root_directory = Path() + root_directory = "" # ---- File folder - data_directory = Path(file_settings["directory"]) + # data_directory = Path(file_settings["directory"]) + data_directory = file_settings["directory"] # ---- Createa directory path - directory_path = root_directory / data_directory + # directory_path = root_directory / data_directory + if root_directory != "": + directory_path = "/".join([root_directory, data_directory]) + else: + directory_path = data_directory # Validate filepath, columns, datatypes # ---- Error evaluation (if applicable) @@ -224,7 +231,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate that files even exist # ---- List available *.zarr files - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + # data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) # ---- Error evaluation (if applicable) # if not data_files: # raise FileNotFoundError( @@ -233,21 +240,25 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Check and format specific input filenames 
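    # NOTE: entries supplied via `input_filenames` are joined onto `directory_path` with
    # plain string concatenation below rather than `pathlib`, presumably so that remote
    # URI prefixes (e.g., "s3://") are left untouched.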
if isinstance(input_filenames, list): - data_files = [directory_path / filename for filename in input_filenames] + # data_files = [directory_path / filename for filename in input_filenames] + data_files = ["/".join([directory_path, filename]) for filename in input_filenames] # ---- Raise Error elif input_filenames is not None: raise TypeError( "Data loading argument `input_filenames` must be a list." ) - # - root_directory = file_configuration["database_directory"] + else: + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + + # Database root directory + database_root_directory = file_configuration["database_directory"] # Initialize the database file - initialize_database(root_directory, file_settings) + initialize_database(database_root_directory, file_settings) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( - query_processed_files(root_directory, file_settings, data_files) + query_processed_files(database_root_directory, file_settings, data_files) ) # Return the valid filenames/paths From 48dd27a2fb04e151f668f97efa2dcd2694516f12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:00:37 -0700 Subject: [PATCH 35/81] More Path removal changes --- echopop/live/live_data_loading.py | 2 +- echopop/live/sql_methods.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 96485229..6b9b50ef 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -96,7 +96,7 @@ def filter_filenames(directory_path: Path, filename_id: str, # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[Path], file_configuration: dict, +def read_biology_files(biology_files: List[str], file_configuration: dict, pandas_kwargs: dict = {}): # Get the biology data file settings diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index eb009780..d1504f90 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -650,9 +650,11 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List # Create filepath to the SQL database # ---- Create Path to SQL database file # db_directory = Path(file_configuration["database_directory"]) - db_directory = Path(root_directory) + # db_directory = Path(root_directory) + db_directory = root_directory # ---- Complete path to the database file - db_file = db_directory / db_name + # db_file = db_directory / db_name + db_file = "/".join([db_directory, db_name]) # Create a list of string-formatted Path names files_str = [str(file) for file in files] From 7c5d38ec5c06616dd8945a79823500986e901853 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:01:08 -0700 Subject: [PATCH 36/81] More Path removal --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 6b9b50ef..cbf04526 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -57,7 +57,7 @@ def live_configuration(live_init_config_path: Union[str, Path], # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: 
List[Path], +def read_acoustic_files(acoustic_files: List[str], xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. From 152b703c8748904ee80ff828b601978c838c3484 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:46:40 -0700 Subject: [PATCH 37/81] Coastline db update fixes (pathing) --- echopop/live/live_spatial_methods.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 33f534dd..9fbefd05 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -358,18 +358,22 @@ def initialize_grid(file_configuration = dict): # Get root directory, if defined if "data_root_dir" in file_configuration: - root_dir = Path(file_configuration["data_root_dir"]) + # root_dir = Path(file_configuration["data_root_dir"]) + root_dir = file_configuration["data_root_dir"] else: - root_dir = Path() + # root_dir = Path() + root_dir = "" # Get `grid` settings grid_database = file_configuration["input_directories"]["grid"]["database_name"] # ---- db_directory = Path(file_configuration["database_directory"]) + # db_directory = file_configuration["database_directory"] # Create full filepath # db_filepath = root_dir / "database" / grid_database db_filepath = db_directory / grid_database + # db_filepath = "/".join([db_directory, grid_database]) # ---- Update config file_configuration["database"]["grid"] = db_filepath @@ -455,19 +459,23 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] # ---- Get root folder directory - coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + coast_root = ( + "/".join([root_dir, coast_settings["directory"], coast_settings["coastline_name"]]) + ) # ---- Create filepath shp_filepath = ( # root_dir / coast_settings["directory"] # / coast_settings["coastline_name"] - coast_root - / f"{coast_settings['coastline_name']}.shp" + # coast_root + # / f"{coast_settings['coastline_name']}.shp" + "/".join([coast_root, f"{coast_settings['coastline_name']}.shp"]) ) # ---- Validate existence - if not shp_filepath.exists(): - raise FileNotFoundError( - f"{shp_filepath} does not exist!" - ) + # if not shp_filepath.exists(): + # raise FileNotFoundError( + # f"{shp_filepath} does not exist!" 
+ # ) # Get original lat/lon geometry boundaries xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds From c75be735edaf86adbbe69f307ba02a79513179f9 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 12:54:17 -0700 Subject: [PATCH 38/81] Add `storage_options` input for `pygrio.read_file` --- echopop/live/live_spatial_methods.py | 2 +- echopop/live/live_survey.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 9fbefd05..b57e0746 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -481,7 +481,7 @@ def initialize_grid(file_configuration = dict): xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds # Read in file - full_coast = gpd.read_file(shp_filepath) + full_coast = gpd.read_file(shp_filepath, **file_configuration["storage_options"]) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 3d5dd446..07387c26 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -50,6 +50,7 @@ def __init__( self, live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], + cloud_storage_options: dict = {}, verbose: bool = True, ): # Initialize `meta` attribute @@ -61,10 +62,14 @@ def __init__( # initialize the Survey class object self.config = eldl.live_configuration(Path(live_init_config_path), Path(live_file_config_path)) - # # ---- Initialize config key for database files + # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} ) + # ---- Add cloud storage options, if needed + self.config.update( + {"storage_options": cloud_storage_options} + ) # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) From 755c9cf762a10e3d3b46c927191ae0a6146f9de3 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 13:01:18 -0700 Subject: [PATCH 39/81] Fix to `storage_options` arg for `geopandas` --- echopop/live/live_spatial_methods.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index b57e0746..00bc2711 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -481,7 +481,8 @@ def initialize_grid(file_configuration = dict): xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds # Read in file - full_coast = gpd.read_file(shp_filepath, **file_configuration["storage_options"]) + full_coast = gpd.read_file(shp_filepath, + storage_options=file_configuration["storage_options"]) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty From 9fc24936c1e65139826701ac10233d7f1eb1f861 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 13:19:12 -0700 Subject: [PATCH 40/81] Updated `pygrio` engine settings --- echopop/live/live_spatial_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 00bc2711..2d7ac606 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -482,6 +482,7 @@ def initialize_grid(file_configuration = dict): # Read in file full_coast = gpd.read_file(shp_filepath, + engine="pyogrio", storage_options=file_configuration["storage_options"]) # ---- Convert to 
UTM full_coast_utm = full_coast.to_crs(utm_code) From 1c6c81a9792617e9813ea903318f83f6f7dcdb32 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 15:19:21 -0700 Subject: [PATCH 41/81] Fixed random/inconsistent column key missing --- echopop/live/live_biology.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 5fcf3c32..99264e0f 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -120,8 +120,11 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - # ---- Add to the outgoing dictionary (and drop SQL db identifier) - sql_results_dict.update({table_name: table_df.drop(columns="id")}) + # ---- Drop SQL db identifier + if "id" in table_df.columns: + table_df.drop(columns="id", inplace=True) + # ---- Add to the outgoing dictionary + sql_results_dict.update({table_name: table_df}) # Return the output return filtered_biology_output, sql_results_dict From 95901d354c802a4a452bd25d433ab24f32567d69 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 15:53:11 -0700 Subject: [PATCH 42/81] Change files read/processed tracking --- echopop/live/live_acoustics.py | 2 +- echopop/live/live_data_loading.py | 2 +- echopop/live/live_survey.py | 20 ++++++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 24f96681..5aea43f7 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -267,7 +267,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # Update the successfully processed files query_processed_files(root_database, file_configuration["input_directories"]["acoustics"], - meta_dict["provenance"]["acoustic_files"], + meta_dict["provenance"]["acoustic_files_read"], processed=True) # Insert the new data into the database & pull in the combined dataset diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index cbf04526..abc69322 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -248,7 +248,7 @@ def validate_data_directory(file_configuration: dict, dataset: str, "Data loading argument `input_filenames` must be a list." 
) else: - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) # Database root directory database_root_directory = file_configuration["database_directory"] diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 07387c26..97ac4425 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -95,7 +95,7 @@ def __repr__(self): # Get any acoustic files created if "acoustic_files" in self.meta["provenance"]: # ---- Get the filenames - acoustic_filenames = self.meta["provenance"]["acoustic_files"] + acoustic_filenames = self.meta["provenance"]["acoustic_files_read"] # ---- Subset if many files are being processed if len(acoustic_filenames) > 2: acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] @@ -107,7 +107,7 @@ def __repr__(self): # Get any biology files created if "biology_files" in self.meta["provenance"]: # ---- Get the filenames - biology_filenames = self.meta["provenance"]["biology_files"] + biology_filenames = self.meta["provenance"]["biology_files_read"] # ---- Subset if many files are being processed if len(biology_filenames) > 4: biology_filenames = biology_filenames + ["..."] @@ -156,7 +156,7 @@ def load_acoustic_data(self, self.config) # ---- Add meta key self.meta["provenance"].update({ - "acoustic_files": acoustic_files, + "acoustic_files_read": acoustic_files, }) # TODO: Add verbosity for printing database filepaths/connections if verbose: @@ -198,7 +198,7 @@ def load_biology_data(self, # Add meta key self.meta["provenance"].update({ - "biology_files": biology_files, + "biology_files_read": biology_files, }) def process_biology_data(self): @@ -275,9 +275,14 @@ def process_biology_data(self): # Update the database query_processed_files(root_directory, self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files"], + self.meta["provenance"]["biology_files_read"], processed=True) + # Add meta key + self.meta["provenance"].update({ + "biology_files_processed": self.meta["provenance"]["biology_files_read"] + }) + def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -303,7 +308,10 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): self.config, self.meta) - # Update the database + # Add meta key + self.meta["provenance"].update({ + "acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"] + }) def estimate_population(self, working_dataset: Literal["acoustic", "biology"], From 10ebb88fff1d24bcf05cbad45db5e8d374265961 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 16:19:53 -0700 Subject: [PATCH 43/81] Add file read checkpointing (`load_biology_data`) --- echopop/live/live_survey.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 97ac4425..03c0651b 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -178,6 +178,9 @@ def load_biology_data(self, biology_files = eldl.validate_data_directory(self.config, dataset="biology", input_filenames=input_filenames) + # ! 
REMOVE + self.meta["provenance"]["biology_files_checkpoint1"] = biology_files + # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: # ---- Create file list @@ -190,12 +193,22 @@ def load_biology_data(self, # Read in the biology data files initial_biology_output = eldl.read_biology_files(biology_files, self.config, pandas_kwargs=pandas_kwargs) + + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] =( + {key: df.shape for key, df in initial_biology_output.items()} + ) # Preprocess the biology dataset self.input["biology"], self.input["biology_processed"] = ( preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) ) + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] = ( + {key: df.shape for key, df in self.input["biology_processed"].items()} + ) + # Add meta key self.meta["provenance"].update({ "biology_files_read": biology_files, From 3e1d06032cc9c3cec4e0f4bea3fac46f4c5a05b5 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:17:28 -0700 Subject: [PATCH 44/81] fix to `read_csv` --- echopop/live/live_data_loading.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index abc69322..ecb60426 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -4,6 +4,7 @@ import re from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database import pandas as pd +import numpy as np from datetime import datetime import xarray as xr @@ -113,16 +114,18 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} # # ---- Create filepath object if "data_root_dir" in file_configuration: - directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + # directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] + directory_path = "/".join([file_configuration["data_root_dir"], file_settings["directory"]]) else: - directory_path = Path(file_settings["directory"]) + directory_path = file_settings["directory"] # Add SQL file to dict # file_configuration["database"]["biology"] = ( # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] # ) file_configuration["database"]["biology"] = ( - Path(file_configuration["database_directory"]) / file_settings["database_name"] + # Path(file_configuration["database_directory"]) / file_settings["database_name"] + "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) @@ -136,7 +139,7 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, # ---- If there are dataset files available if dataset_files: # ---- Read in validated biology data - dataframe_list = [read_biology_csv(Path(file), + dataframe_list = [read_biology_csv(file, file_settings["file_name_formats"][dataset], biology_config_map[dataset], pandas_kwargs) @@ -281,7 +284,7 @@ def compile_filename_format(file_name_format: str): def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), **pandas_kwargs) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns From 
5ed0ce2970d9a0b2bd9a5e05dc8b69295a9db679 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:24:18 -0700 Subject: [PATCH 45/81] Fix glob cmd --- echopop/live/live_data_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index ecb60426..84993104 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -87,7 +87,7 @@ def filter_filenames(directory_path: Path, filename_id: str, # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Create Path object with the generalized format - subfile_path_obj = directory_path.glob(f"{file_id_format}.{file_extension}") + subfile_path_obj = Path(directory_path).glob(f"{file_id_format}.{file_extension}") # ---- List all files that match this pattern subfile_str = [str(file) for file in list(subfile_path_obj)] From c6697d2b4333ee839aef07831e160962561fbc62 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 17:28:42 -0700 Subject: [PATCH 46/81] Index fix --- echopop/live/live_survey.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 03c0651b..9b468191 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -205,7 +205,7 @@ def load_biology_data(self, ) # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint2"] = ( + self.meta["provenance"]["biology_files_checkpoint3"] = ( {key: df.shape for key, df in self.input["biology_processed"].items()} ) From c7d2244ee8e16a832e56b211fd07d141845343b6 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:00:03 -0700 Subject: [PATCH 47/81] Fixed methods for s3 bucket --- echopop/live/live_data_loading.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 84993104..2e707c5c 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -86,8 +86,18 @@ def filter_filenames(directory_path: Path, filename_id: str, file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = Path(directory_path).glob(f"{file_id_format}.{file_extension}") + # ---- Compile the pattern + pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') + # ---- Create Path object with the generalized format: S3 + s3_files = [filename for filename in files + if filename.startswith("s3://") and pattern.search(filename)] + # ---- Local search + local_files = Path(directory_path).glob(f"{file_id_format}.{file_extension}") + # ---- Assign to subfile path object + if s3_files: + subfile_path_obj = s3_files + else: + subfile_path_obj = local_files # ---- List all files that match this pattern subfile_str = [str(file) for file in list(subfile_path_obj)] @@ -128,7 +138,6 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) - # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists @@ -284,7 +293,9 @@ def compile_filename_format(file_name_format: str): def 
read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) + df = pd.read_csv(file, + usecols=list(config_map["dtypes"].keys()), + storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns @@ -309,7 +320,7 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_k # Compile the filename regular expression compiled_regex = compile_filename_format(pattern) # ---- Create the `Match` object that will be used to parse the string - match_obj = compiled_regex.search(file.name) + match_obj = compiled_regex.search(file) # Iterate through the filename-derived tags and add them to the DataFrame for i in valid_tags: From 89880bbae45f158bbe24bd93a4fadad2356bd45a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:08:03 -0700 Subject: [PATCH 48/81] Removed f-string --- echopop/live/live_data_loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 2e707c5c..96631b35 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -87,7 +87,9 @@ def filter_filenames(directory_path: Path, filename_id: str, # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Compile the pattern - pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') + escaped_file_id_format = re.escape(file_id_format) + pattern = re.compile(escaped_file_id_format.replace(r"\*", ".*")) + # pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') # ---- Create Path object with the generalized format: S3 s3_files = [filename for filename in files if filename.startswith("s3://") and pattern.search(filename)] From 724424693469d8a4bf45f2ac9d50475db3748a12 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 14 Aug 2024 18:38:34 -0700 Subject: [PATCH 49/81] `live_visualizer` module --- echopop/live/live_visualizer.py | 358 ++++++++++++++++++++++++++++++++ echopop/test_workflow.py | 108 ++-------- 2 files changed, 373 insertions(+), 93 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index e69de29b..0975c08a 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -0,0 +1,358 @@ +from echopop.live.sql_methods import SQL +from shapely import wkt +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +import numpy as np +import pandas as pd +import geopandas as gpd +from typing import Union, Optional +from pathlib import Path + +def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None): + + # Extract grid data from database if needed + if isinstance(grid_db, Path): + # ---- SELECT + grid_data = SQL(grid_db, "select", table_name="grid_df") + elif not isinstance(grid_db, pd.DataFrame): + raise TypeError( + "Grid data input (`grid_data`) must either be a `Path` or `pandas.DataFrame` object." 
+ ) + else: + grid_data = grid_db + + # Extract coast data from database if needed + if isinstance(coast_db, Path): + # ---- SELECT + coast_data = SQL(coast_db, "select", table_name="coastline_df") + elif coast_data is None: + # ---- SELECT from `grid_data` + coast_data = SQL(grid_db, "select", table_name="coastline_df") + elif not isinstance(coast_db, pd.DataFrame): + raise TypeError( + "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object, " + "or exist within the SQL database as a table (`'coastline_df'`) within the `grid_data` " + "input (i.e. `grid_data.db`)." + ) + else: + coast_data = coast_db + + # Format columns if needed (well-known-text to Polygon) + # ---- `grid_data` + if isinstance(grid_data["geometry"][0], str): + grid_data["geometry"] = grid_data["geometry"].apply(wkt.loads) + # ---- `coastline_data` + if isinstance(coast_data["geometry"][0], str): + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + + # Generate GeoDataFrames + # ---- `grid` + grid_gdf = gpd.GeoDataFrame(grid_data, geometry="geometry", crs=projection) + # ---- `coast` + coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) + + # Get appropriate plot axis-limits + axis_limits = grid_gdf.total_bounds + + # Variable label dictionary map + VARIABLE_MAP = { + "number_density_mean": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$", + "colormap": "viridis", + }, + "biomass_density_mean": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$", + "colormap": "plasma", + }, + "biomass": { + "name": "Biomass", + "units": "kg", + "colormap": "cividis", + }, + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$", + "colormap": "inferno", + } + } + + # Create a figure and a 2x2 grid of subplots + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # List of variables to plot + variables = list(VARIABLE_MAP.keys()) + + # Iterate through and plot all subplots + for ax, var in zip(axes.flat, variables): + # ---- Get the colormap + colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # ---- Invert + newcolors = colormap (np.linspace(0, 1, 256))[::-1] + # ---- Define `white` + white = np.array([1, 1, 1, 1]) + # ---- Replace "start" color + newcolors[0, :] = white + # ---- Create the new custom colormap + custom_cmap = ListedColormap(newcolors) + # ---- Normalize colorscale + norm=plt.Normalize(vmin=grid_gdf[var].min(), vmax=grid_gdf[var].max()) + # ---- Plot the polygons with color fills based on the variable (non-zero) + grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, + norm=norm, + markersize=0, linewidth=0.5) + # ---- Add coastline data layer + coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + # ---- Set axis limits + ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*1.01) + ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + # ---- Trim down the margins + ax.margins(0,0) + # ---- Set adjustable aspect ratio + # ax.set_aspect('equal', adjustable='box') + # ---- Set the title and labels + var_info = VARIABLE_MAP[var] + ax.set_title(f"{var_info['name']}") + # ---- Set axis labels + plt.xlabel(u'Longitude (\u00B0E)') + plt.ylabel(u'Latitude (\u00B0N)') + # ---- Add colorbar + sm = plt.cm.ScalarMappable(cmap=custom_cmap, + norm=plt.Normalize(vmin=grid_gdf[var].min(), + vmax=grid_gdf[var].max())) + sm._A = [] # fake up the array of the scalar mappable + cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + 
cbar.set_label(f"{var_info['units']}") + # ---- Add scalebar + scalebar_length = 250 # Length of scale bar in km + scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km + # ---- Transform scale bar coordinates to axis units + # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 + # scalebar_y = axis_limits[1]*0.98 + (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.1 + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + x_scale = (x1 - x0) * 0.1 + y_scale = (y1 - y0) * 0.1 + # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # [scalebar_y, scalebar_y], color='black', lw=2) + ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], color='black', lw=2) + # ---- Add scale text + ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, + f'{scalebar_length} km', ha='center', va='top', color='black') + + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, + # f'{scalebar_length} km', ha='center', va='bottom', color='black') + + # Adjust layout + plt.tight_layout() + + # Show the plot + plt.show() + +def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None): + + # Extract grid data from database if needed + if isinstance(survey_data_db, Path): + # ---- SELECT + survey_data = SQL(survey_data_db, "select", table_name="survey_data_df") + elif not isinstance(survey_data_db, pd.DataFrame): + raise TypeError( + "Grid data input (`grid_data`) must either be a `Path` or `pandas.DataFrame` object." + ) + else: + survey_data = survey_data_db + + # Extract coast data from database if needed + if isinstance(coast_db, Path): + # ---- SELECT + coast_data = SQL(coast_db, "select", table_name="coastline_df") + elif not isinstance(coast_db, pd.DataFrame): + raise TypeError( + "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object." + ) + else: + coast_data = coast_db + + # Format columns if needed (well-known-text to Polygon) + # ---- `coastline_data` + if isinstance(coast_data["geometry"][0], str): + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + + # Generate GeoDataFrames + # ---- `grid` + survey_gdf = gpd.GeoDataFrame(survey_data, + geometry=gpd.points_from_xy(survey_data["longitude"], + survey_data["latitude"]), + crs=projection) + # ---- `coast` + coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) + + # Get appropriate plot axis-limits + axis_limits = survey_gdf.total_bounds + + # Variable label dictionary map + VARIABLE_MAP = { + "number_density": { + "name": "Mean number density", + "units": "fish $\\mathregular{nmi^{-2}}$", + "colormap": "inferno", + "minimum": 0.0, + "cbar_reverse": True, + "size": [25, 250] + }, + "biomass_density": { + "name": "Mean biomass density", + "units": "kg $\\mathregular{nmi^{-2}}$", + "colormap": "plasma", + "minimum": 0.0, + "cbar_reverse": True, + "size": [25, 250] + }, + "nasc": { + "name": "Nautical area scattering coefficient", + "units": "$\\mathregular{m^{2}~nmi^{-2}}$", + "colormap": "viridis", + "minimum": 0.0, + "cbar_reverse": False, + "size": [25, 250] + }, + "max_Sv": { + "name": "Max $\\mathregular{S_V}$", + "units": "dB re. 
1 $\\mathregular{m^-1}$", + "colormap": "viridis", + "minimum": -999, + "cbar_reverse": True, + "color_threshold": { + "minimum": -80.0, + "maximum": -36.0 + }, + "size": [5, 200] + }, + # "mean_Sv": { + # "name": "$Mean \\mathregular{S_V}$", + # "units": "dB re. 1 $\\mathregular{m^-1}$", + # "colormap": "viridis", + # "minimum": -999, + # "cbar_reverse": True, + # "color_threshold": { + # "minimum": -80.0, + # "maximum": -36.0 + # } + # }, + } + + # List of variables to plot + variables = list(VARIABLE_MAP.keys()) + + def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): + + # Censor values if needed + sizes = values.copy() + sizes.loc[sizes < min_value] = min_value + sizes.loc[sizes > max_value] = max_value + + return ( + ((sizes - min_value) / (max_value - min_value)) + * (max_size - min_size) + min_size + ) + + # Create a figure and a 2x2 grid of subplots + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # Iterate through and plot all subplots + for ax, var in zip(axes.flat, variables): + # ---- Get the colormap + colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # ---- Invert + if VARIABLE_MAP[var]["cbar_reverse"]: + newcolors = colormap(np.linspace(0, 1, 256))[::-1] + # ---- Create the new custom colormap + custom_cmap = ListedColormap(newcolors) + # ---- Plot cruisetrack + # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") + ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + linewidth=0.25, linestyle="-") + # ---- Drop "empty" values + sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] + # ---- Assign color range + if "color_threshold" in VARIABLE_MAP[var].keys(): + min_value = VARIABLE_MAP[var]["color_threshold"]["minimum"] + max_value = VARIABLE_MAP[var]["color_threshold"]["maximum"] + else: + min_value = sub_gdf[var].min() + max_value = sub_gdf[var].max() + # ---- Normalize colorscale + norm=plt.Normalize(vmin=min_value, vmax=max_value) + # ---- Plot the points with color fills based on the variable (non-zero) + ax.scatter( + [geom.x for geom in sub_gdf.geometry], + [geom.y for geom in sub_gdf.geometry], + c=sub_gdf[var], + s=scale_sizes(values=sub_gdf[var], + min_value=min_value, + max_value=max_value, + min_size=VARIABLE_MAP[var]["size"][0], + max_size=VARIABLE_MAP[var]["size"][1]), + cmap=custom_cmap, + norm=norm, + edgecolor="black", + linewidths=0.5 + ) + # ---- Add coastline data layer + coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + # ---- Set axis limits + ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*0.995) + ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + # ---- Trim down the margins + ax.margins(0,0) + # ---- Set adjustable aspect ratio + # ax.set_aspect('equal', adjustable='box') + # ---- Set the title and labels + var_info = VARIABLE_MAP[var] + ax.set_title(f"{var_info['name']}") + # ---- Set axis labels + plt.xlabel(u'Longitude (\u00B0E)') + plt.ylabel(u'Latitude (\u00B0N)') + # ---- Add colorbar + sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) + sm._A = [] # fake up the array of the scalar mappable + cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + cbar.set_label(f"{var_info['units']}") + # ---- Add scalebar + scalebar_length = 250 # Length of scale bar in km + scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km + # ---- Transform scale bar coordinates to axis units + # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 + # scalebar_y = 
axis_limits[1]*0.98 + (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.1 + x0, x1 = ax.get_xlim() + y0, y1 = ax.get_ylim() + x_scale = (x1 - x0) * 0.1 + y_scale = (y1 - y0) * 0.1 + # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # [scalebar_y, scalebar_y], color='black', lw=2) + ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], color='black', lw=2) + # ---- Add scale text + ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, + f'{scalebar_length} km', ha='center', va='top', color='black') + + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, + # f'{scalebar_length} km', ha='center', va='bottom', color='black') + + # Adjust layout + plt.tight_layout() + + # Show the plot + plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index e95f7336..47844f25 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,5 +1,7 @@ from echopop.live.live_survey import LiveSurvey from echopop.live.sql_methods import SQL +import echopop.live.live_visualizer as elv +from pathlib import Path #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration @@ -56,7 +58,7 @@ # !!! The SQL functions will fail if the tables have not yet been created/initialized # ---- ACOUSTICS # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") # NOTE: Along-track acoustically-derived number/biomass densities and NASC SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") # ---- BIOLOGICAL @@ -66,95 +68,15 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") - -dat = realtime_survey.input["acoustics"]["prc_nasc_df"].copy() -dat = dat[dat.latitude > 40] -dat = dat[dat.depth > 20] - -import matplotlib.pyplot as plt -import seaborn as sns -from geopy.distance import geodesic -import pandas as pd -from datetime import datetime -import matplotlib.dates as mdates -def calculate_distances(df): - distances = [0] # Start with 0 for the first point - for i in range(1, len(df)): - point1 = (df.iloc[i - 1]['latitude'], df.iloc[i - 1]['longitude']) - point2 = (df.iloc[i]['latitude'], df.iloc[i]['longitude']) - distances.append(geodesic(point1, point2).meters) - return distances - -def parse_datetime(date_str): - # List of possible formats - formats = [ - '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds - '%Y-%m-%d %H:%M:%S', # Without fractional seconds - '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds - '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds - ] - - for fmt in formats: - try: - return pd.to_datetime(date_str, format=fmt) - except (ValueError, TypeError): - continue # Try the next format - - return pd.NaT # Return NaT if no formats match - -dat["ping_time"] = dat["ping_time"].apply(parse_datetime) - -pivot_table = dat.pivot_table(index=["depth"], columns=["ping_time"], 
values=["NASC"], aggfunc="mean") -# Get the unique distance and depth values for plotting -plt.figure(figsize=(10, 8)) -ax = sns.heatmap(pivot_table, cmap="viridis", cbar_kws={'label': 'NASC'}) -plt.gca().xaxis.set_major_locator(mdates.MinuteLocator(interval=30)) # Major ticks every 30 minutes -plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) # Format as hour:minute -plt.gcf().autofmt_xdate() -ax.set_xticks(ax.get_xticks()[::max(len(ax.get_xticks()) // 10, 1)]) # Show fewer ticks if necessary -plt.xlabel('Ping time') -plt.ylabel('Depth') -# plt.gca().invert_yaxis() # To have depth increasing downwards like in a typical depth plot -plt.show() - - -dat.groupby(["ping_time"]).size() -unique_pairs = dat.drop_duplicates(subset=['latitude', 'longitude']).sort_values("ping_time") - -unique_pairs["d"] = calculate_distances(dat) -df['cumulative_distance'] = df['distance'].cumsum() - -unique_distances = dat.groupby('source')[['latitude', 'longitude']].unique().reset_index() -unique_distances = unique_distances.explode('distance') - - -# Create a pivot table to reshape the dataframe suitable for a heatmap -dat['source_id'] = dat['source'].astype('category').cat.codes -pivot_table = dat.pivot(index=["depth"], columns=["distance"], values=["NASC"]) -dat.groupby('source')['distance'].cumsum() -plt.plot(index="depth", columns="distance", values="NASC") -plt.show() - -data = { - 'distance': [1, 1, 2, 2, 1, 1, 3, 3], - 'depth': [1, 2, 1, 2, 1, 2, 1, 2], - 'source': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'] -} -dat = pd.DataFrame(data) -dat = dat.sort_values(by=['source', 'distance']) -unique_distances = dat.groupby('source')['distance'].unique().reset_index() -unique_distances = unique_distances.explode('distance') - -unique_distances['distance'] = pd.to_numeric(unique_distances['distance'], errors='coerce') -unique_distances['distance_diff'] = unique_distances.groupby('source')['distance'].diff().fillna(0) -unique_distances['cumsum_diff'] = unique_distances.groupby('source')['distance_diff'].cumsum() -unique_distances['Cumsum_dist'] = unique_distances['cumsum_diff'].cumsum() -unique_distances['Cumsum_dist'] = pd.to_numeric(unique_distances['Cumsum_dist'], errors='coerce') -dat = dat.merge(unique_distances[['source', 'distance', 'Cumsum_dist']], on=['source', 'distance'], how='left') - - -# Calculate cumulative sum of distances for each source -dat['Cumsum_dist'] = dat.groupby('source')['distance'].transform(lambda x: x.cumsum()) -dat['Cumsum_dist_within_source'] = dat.groupby('source')['distance'].cumsum() - -dat['Cumsum_dist'] = dat.groupby('source')['Cumsum_dist_within_source'].transform(lambda x: x + x.shift(1).fillna(0).cumsum()) +#################################################################################################### +# FROM THE `LiveSurvey` object ! 
+# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +grid_db = Path(realtime_survey.config["database"]["grid"]) +survey_data_db = Path('C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database/acoustics.db') +coast_db = grid_db +projection = realtime_survey.config["geospatial"]["projection"] +# NOTE: PLOTS +# ---- PLOT GRID +elv.plot_livesurvey_grid(grid_db, projection, coast_db) +# ---- PLOT TRACK +elv.plot_livesurvey_track(survey_data_db, projection, coast_db) From 76085c3f63b879d7c06053b15d0409dd2434b162 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 15 Aug 2024 11:57:27 -0700 Subject: [PATCH 50/81] Minor changes to axis labels --- echopop/live/live_visualizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 0975c08a..cd085399 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -117,8 +117,8 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - plt.xlabel(u'Longitude (\u00B0E)') - plt.ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel(u'Longitude (\u00B0E)') + ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=plt.Normalize(vmin=grid_gdf[var].min(), @@ -320,8 +320,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - plt.xlabel(u'Longitude (\u00B0E)') - plt.ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel(u'Longitude (\u00B0E)') + ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable From e1ec7a0d4cd70fe1640cda858b0f409c3deafa2b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Thu, 15 Aug 2024 14:13:50 -0700 Subject: [PATCH 51/81] Plotting function for bio distirbutions --- echopop/live/live_visualizer.py | 134 ++++++++++++++++++++++++++++++++ echopop/test_workflow.py | 14 ++++ 2 files changed, 148 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index cd085399..89977b12 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -356,3 +356,137 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Show the plot plt.show() + +def plot_livesurvey_distributions(weight_table: pd.DataFrame, + stratum_table: pd.DataFrame, + specimen_table: pd.DataFrame, + length_table: pd.DataFrame, + biology_db: Optional[Path] = None): + + # If calling from SQL database + if biology_db is not None: + weight_table = SQL(biology_db, "select", table_name="length_weight_df") + stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") + specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") + length_table = SQL(biology_db, "select", table_name="length_df") + elif not all([isinstance(df, pd.DataFrame) for df in [weight_table, stratum_table, + specimen_table, length_table]]): + raise TypeError( + "All tables must be a `pandas.DataFrame." 
+ ) + + # Organize the weight table data + # ---- Sum weights by stratum, sex, and length_bin + aggregated_data = ( + weight_table.groupby(['stratum', 'sex', 'length_bin'])['weight'].sum().reset_index() + ) + # ---- Create a column to indicate 'all' sexes + aggregated_data_all = ( + aggregated_data.groupby(['stratum', 'length_bin'])['weight'].sum().reset_index() + ) + aggregated_data_all['sex'] = 'all' + # ---- Combine the male, female, and all data + plot_weight_data = pd.concat([aggregated_data, aggregated_data_all], ignore_index=True) + + # Define the sexes + sexes = plot_weight_data.sex.unique().tolist() + + # Organize the length table data + bins = plot_weight_data.length_bin.unique() + 1 + full_bins = np.concatenate([[bins[0] - np.diff(bins).mean() / 2], bins]) + length_table["length_bin"] = ( + pd.cut(length_table["length"], bins=full_bins, labels=bins - 1).astype(float) + ) + length_table_sex = ( + length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() + ) + length_table_all = ( + length_table.groupby(["stratum", "length_bin"])["length_count"].sum().reset_index() + ) + length_table_all['sex'] = 'all' + full_count = ( + specimen_table.meld(length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"]) + .loc[lambda x: x.sex.isin(sexes)] + .groupby(['stratum', 'sex', 'length_bin'])['length_count'].sum().reset_index() + ) + full_count["total"] = full_count.groupby(["stratum", "sex"])["length_count"].transform("sum") + full_count["number_proportion"] = full_count["length_count"] / full_count["total"] + # ---- Combine into the full dataset for plotting + plot_count_data = ( + plot_weight_data + .merge(full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), + on=["stratum", "sex", "length_bin"], how="left") + ).fillna(0.0) + + # Get a color map + colors = plt.colormaps['tab10'] + num_strata = len(stratum_table['stratum'].unique()) + num_sexes = len(sexes) + color_map = colors(num_strata) + + # Plot + fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(6, 8), sharex=True, sharey=True) + plt.subplots_adjust(hspace=0.08, wspace=0.05, bottom=0.25) # Adjust spacing between plots + + # Plot weights and counts + for i, sex in enumerate(sexes): + # Weight plot (left column) + ax_weight = axes[i, 0] + data_weight = plot_weight_data[plot_weight_data['sex'] == sex] + for j, (stratum, group) in enumerate(data_weight.groupby('stratum')): + # color = colors(i / num_strata) if num_strata > 1 else colors(0) + color = colors(j / num_strata) if num_strata > 1 else colors(0) + total = group["weight"].sum() + group["proportions"] = group["weight"] / total if total > 0.0 else 0.0 + ms = 5 if group["proportions"].max() > 0.0 else 0.1 + # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + # label=f'Stratum {stratum}', color=color, ms=ms) + ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + label=f'Stratum {stratum}', color=color, ms=ms) + if i == 0: + ax_weight.set_title(f'Weight') + if i < num_sexes - 1: # No x-ticks for non-bottom plots + ax_weight.set_xlabel('') + if i == num_sexes // 2: + ax_weight.set_ylabel('Within-stratum proportion [0, 1]') + if i == num_sexes - 1: # Bottom plot + ax_weight.set_xlabel('Length bin (cm)') + ax_weight.set_ylim(0.0, 1.0) + # Add label in the top-left corner + ax_weight.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), + transform=ax_weight.transAxes, + fontsize=12, verticalalignment='top', + bbox=dict(facecolor='white', alpha=0.8, + edgecolor='none')) 
+ + # Count plot (right column) + ax_count = axes[i, 1] + data_count = plot_count_data[plot_count_data['sex'] == sex] + for j, (stratum, group) in enumerate(data_count.groupby('stratum')): + color = colors(j / num_strata) if num_strata > 1 else colors(0) + ms = 5 if group["number_proportion"].max() > 0.0 else 0.1 + ax_count.plot(group['length_bin'], group['number_proportion'], + marker='o', label=f'Stratum {stratum}', color=color, ms=ms) + if i == 0: + ax_count.set_title(f"Number") + if i < num_sexes - 1: # No x-ticks for non-bottom plots + ax_count.set_xlabel('') + if i == num_sexes - 1: # Bottom plot + ax_count.set_xlabel('Length bin (cm)') + ax_count.set_ylim(0.0, 1.0) + # Add label in the top-left corner + ax_count.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), + transform=ax_count.transAxes, + fontsize=12, verticalalignment='top', + bbox=dict(facecolor='white', alpha=0.8, + edgecolor='none')) + # Create a new axes for the legend + legend_ax = fig.add_axes([0.15, 0.05, 0.7, 0.1]) # Position the legend axes (left, bottom, width, height) + legend_ax.axis('off') # Hide the new axes + + # Create a shared legend in the bottom-most subplot + handles, labels = axes[2, 1].get_legend_handles_labels() # Get handles and labels from the bottom-left plot + fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), + ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') + + plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 47844f25..74968fdc 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -68,6 +68,10 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") # NOTE: Average weights per stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# NOTE: Stratum summary tables +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +SQL(realtime_survey.config["database"]["biology"], "map") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table @@ -80,3 +84,13 @@ elv.plot_livesurvey_grid(grid_db, projection, coast_db) # ---- PLOT TRACK elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +# ---- PLOT DISTRIBUTIONS +weight_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="length_weight_df") +stratum_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="strata_summary_df") +specimen_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="specimen_data_df") +length_table = SQL(realtime_survey.config["database"]["biology"], "select", + table_name="length_df") +elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) \ No newline at end of file From b36e28489c299f6f63ab6e75925acc9a526bbca4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:21:32 -0700 Subject: [PATCH 52/81] Possible fix to `BIGINT` SQL error --- echopop/live/sql_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index d1504f90..ce118dce 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -435,6 +435,7 @@ def initialize_database(root_directory: Path, file_settings: dict): "INTEGER": int, "DATETIME": str, "TEXT": str, + "BIGINT": int, } def sql_group_update(db_file: str, From c761a9b2a226c0f64fde1857bc702b7cfb7ce41b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:24:41 -0700 Subject: [PATCH 53/81] Validator for successful population run --- echopop/live/live_survey.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 9b468191..79ac8704 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -83,6 +83,8 @@ def __init__( # Initialize the extrapolation grid initialize_grid(self.config) + # TODO: add quick utility function to get db filepaths + # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) @@ -335,7 +337,9 @@ def estimate_population(self, eldp.acoustic_pipeline(self.input["acoustics"], self.input["spatial"]["strata"], self.config, - verbose=verbose) + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["acoustic_population"] = True # method if working_dataset == "biology": @@ -343,4 +347,7 @@ def estimate_population(self, self.input["spatial"]["strata"], self.config, verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["biology_population"] = True + From 0f1e8f2b9bc8188406c06b8370f51250bf2e5364 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 12:26:26 -0700 Subject: [PATCH 54/81] Fix to population validation handshake --- echopop/live/live_survey.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 79ac8704..5523c0e4 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -331,23 +331,25 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): def estimate_population(self, working_dataset: Literal["acoustic", "biology"], verbose: bool = True): - + + self.meta["provenance"][f"{working_dataset}_population"] = False + # method - if working_dataset == "acoustic": - eldp.acoustic_pipeline(self.input["acoustics"], - 
self.input["spatial"]["strata"], - self.config, - verbose=verbose) - # --- Validate successful run - self.meta["provenance"]["acoustic_population"] = True + if working_dataset == "acoustic": + eldp.acoustic_pipeline(self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["acoustic_population"] = True # method - if working_dataset == "biology": - eldp.biology_pipeline(self.input["biology"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose) - # --- Validate successful run - self.meta["provenance"]["biology_population"] = True + if working_dataset == "biology": + eldp.biology_pipeline(self.input["biology"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose) + # --- Validate successful run + self.meta["provenance"]["biology_population"] = True From 370b16f31da66ae0b1ade933f9bcf5cb55f20793 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 15:12:48 -0700 Subject: [PATCH 55/81] Database pathing changes --- echopop/live/live_data_processing.py | 18 ++++++++++++++++-- echopop/live/live_survey.py | 3 ++- echopop/live/sql_methods.py | 6 ++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index a235bf58..8673bd53 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -33,6 +33,20 @@ def get_average_strata_weights(db_file: str, else: return None +def configure_database_paths(file_configuration: dict): + + # Extract input directory settings + file_settings = file_configuration["input_directories"] + + # Get database directory + database_dir = file_configuration["database_directory"] + + # Update configuration + file_configuration["database"].update({ + dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) + for dataset in file_settings.keys() if "database_name" in file_settings[dataset] + }) + def acoustic_pipeline(acoustic_dict: dict, strata_df: pd.DataFrame, file_configuration: dict, @@ -81,8 +95,8 @@ def acoustic_pipeline(acoustic_dict: dict, # Get the corresponding average strata weights (computed for all fish) weight_spatial_averages = get_average_strata_weights(biology_db, - acoustic_dict, - unique_columns=spatial_column + contrast_columns) + acoustic_dict, + unique_columns=spatial_column + contrast_columns) if weight_spatial_averages is not None: # Merge average weights with number density estimates diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 5523c0e4..d8470366 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -83,7 +83,8 @@ def __init__( # Initialize the extrapolation grid initialize_grid(self.config) - # TODO: add quick utility function to get db filepaths + # Add database paths to configuration attribute + eldp.configure_database_paths(self.config) # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index ce118dce..a0cf299c 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -786,6 +786,12 @@ def sql_update_strata_summary(source_db: str, ATTACH DATABASE '{source_db}' AS source; ATTACH DATABASE '{target_db}' AS target; + -- Verify the source database tables + SELECT name FROM source.sqlite_master WHERE type='table'; + + -- Query the source table directly + SELECT * FROM 
source.{source_table} LIMIT 1; + """ # Dynamically format the cross-database command From aca08e47efe293bb0e26eb4c7d11924a0fa8392a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 15:46:15 -0700 Subject: [PATCH 56/81] Fixes to oddities due to `NaN` for cruise plot --- echopop/live/live_acoustics.py | 4 ++++ echopop/live/live_visualizer.py | 10 +++++----- echopop/test_workflow.py | 15 +++++++-------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 5aea43f7..44e61ae0 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -113,6 +113,10 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() + # ---- Change first cell ! + acoustic_df.loc[0, "dz"] = ( + acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] + ) # Initialize echometrics dictionary echometrics = {} diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 89977b12..3985ec53 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -207,7 +207,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, - "size": [25, 250] + "size": [25, 150] }, "biomass_density": { "name": "Mean biomass density", @@ -215,7 +215,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "plasma", "minimum": 0.0, "cbar_reverse": True, - "size": [25, 250] + "size": [25, 150] }, "nasc": { "name": "Nautical area scattering coefficient", @@ -223,7 +223,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "size": [25, 250] + "size": [25, 150] }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", @@ -235,7 +235,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "minimum": -80.0, "maximum": -36.0 }, - "size": [5, 200] + "size": [5, 100] }, # "mean_Sv": { # "name": "$Mean \\mathregular{S_V}$", @@ -305,7 +305,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): cmap=custom_cmap, norm=norm, edgecolor="black", - linewidths=0.5 + linewidths=0.1 ) # ---- Add coastline data layer coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 74968fdc..05b04c03 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -70,14 +70,13 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") # NOTE: Stratum summary tables SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -SQL(realtime_survey.config["database"]["biology"], "map") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) -survey_data_db = Path('C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/database/acoustics.db') coast_db = grid_db +biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] # NOTE: PLOTS # ---- PLOT GRID @@ -85,12 +84,12 @@ # ---- PLOT TRACK elv.plot_livesurvey_track(survey_data_db, projection, coast_db) # ---- PLOT DISTRIBUTIONS -weight_table = SQL(realtime_survey.config["database"]["biology"], "select", +weight_table = SQL(biology_db, "select", table_name="length_weight_df") -stratum_table = SQL(realtime_survey.config["database"]["biology"], "select", +stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") -specimen_table = SQL(realtime_survey.config["database"]["biology"], "select", +specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") -length_table = SQL(realtime_survey.config["database"]["biology"], "select", +length_table = SQL(biology_db, "select", table_name="length_df") -elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) \ No newline at end of file +elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) From eaec504fbfd3e7899a22b19039ad51794e46ab69 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 16:14:18 -0700 Subject: [PATCH 57/81] Updated plotting method for `None` --- echopop/live/live_visualizer.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 3985ec53..1260b787 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -253,6 +253,9 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], # List of variables to plot variables = list(VARIABLE_MAP.keys()) + # Go completed variables + intact_variables = [var for var in variables if not survey_gdf[var].isnull().all()] + def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Censor values if needed @@ -265,11 +268,16 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): * (max_size - min_size) + min_size ) - # Create a figure and a 2x2 grid of subplots - fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + # Create a figure and a 2xn grid of subplots + if len(intact_variables) == 4: + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + elif len(intact_variables) == 3: + fig, axes = plt.subplots(1, 3, figsize=(10, 10)) + elif len(intact_variables) == 2: + fig, axes = plt.subplots(1, 1, figsize=(10, 10)) # Iterate through and plot all subplots - for ax, var in zip(axes.flat, variables): + for ax, var in zip(axes.flat, intact_variables): # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert @@ -304,8 +312,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): max_size=VARIABLE_MAP[var]["size"][1]), cmap=custom_cmap, norm=norm, - edgecolor="black", - linewidths=0.1 + # edgecolor="black", + # linewidths=0.1 ) # ---- Add coastline data layer coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") @@ -325,7 +333,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # ---- Add 
colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable - cbar = fig.colorbar(sm, ax=ax, shrink=0.5) + cbar = fig.colorbar(sm, ax=ax, shrink=0.5, fraction=0.075, pad=0.1) cbar.set_label(f"{var_info['units']}") # ---- Add scalebar scalebar_length = 250 # Length of scale bar in km From 0924489337c48a3e5c9432db55b2edd6ac6ae73b Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 16:32:59 -0700 Subject: [PATCH 58/81] Matplotlib to panel update --- echopop/live/live_visualizer.py | 9 ++++++--- echopop/test_workflow.py | 22 +++++++++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 1260b787..37b67a7b 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -154,7 +154,8 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], plt.tight_layout() # Show the plot - plt.show() + # plt.show() + return fig def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], projection: str, @@ -363,7 +364,8 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): plt.tight_layout() # Show the plot - plt.show() + # plt.show() + return fig def plot_livesurvey_distributions(weight_table: pd.DataFrame, stratum_table: pd.DataFrame, @@ -497,4 +499,5 @@ def plot_livesurvey_distributions(weight_table: pd.DataFrame, fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') - plt.show() + # plt.show() + return fig diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 05b04c03..02b4e557 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -72,6 +72,8 @@ SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") #################################################################################################### # FROM THE `LiveSurvey` object ! 
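The Panel wiring added below wraps each returned Matplotlib figure with the generic `pn.panel(...)` helper; an alternative sketch of the same idea uses Panel's dedicated Matplotlib pane (assuming `panel` and `matplotlib` are installed and rendering happens off-screen):

    import matplotlib
    matplotlib.use("agg")          # render off-screen; figures are only embedded, never shown
    import matplotlib.pyplot as plt
    import panel as pn

    pn.extension()

    def make_figure():
        fig, ax = plt.subplots(figsize=(4, 3))
        ax.plot([0, 1, 2], [0, 1, 4])
        return fig                 # returning the figure (not plt.show()) is what lets Panel embed it

    pane = pn.pane.Matplotlib(make_figure(), tight=True)
    # pane.show()                  # opens a local server in the browser, like plt_to_pn() below
    # pn.serve({"demo": pane})     # or serve it as a named page, like serve_panels() further on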
+# ---- Convert to a Panel +import panel as pn # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) @@ -79,10 +81,20 @@ biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] # NOTE: PLOTS +# Ensure Panel is initialized +pn.extension() +# ---- Helper function +def plt_to_pn(fig): + # Convert to a panel object + panel = pn.panel(fig) + # Display + panel.show() # OR panel.servable() if you want to serve it in a Panel server # ---- PLOT GRID -elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +plt_to_pn(fig) # ---- PLOT TRACK -elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +fig = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +plt_to_pn(fig) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", table_name="length_weight_df") @@ -92,4 +104,8 @@ table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") -elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +fig = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +plt_to_pn(fig) + + + From 1a901864c15437cfd23a3e4d6c3a0389cd721a23 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 17:09:35 -0700 Subject: [PATCH 59/81] Panel naming update --- echopop/test_workflow.py | 89 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 02b4e557..32105daa 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -93,7 +93,7 @@ def plt_to_pn(fig): fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) plt_to_pn(fig) # ---- PLOT TRACK -fig = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) +fig1 = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) plt_to_pn(fig) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", @@ -104,8 +104,93 @@ def plt_to_pn(fig): table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") -fig = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) +fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) plt_to_pn(fig) +### MULTIPANEL +panel0 = pn.panel(fig, name='Gridded population estimates') +panel1 = pn.panel(fig1, name='Alongtrack population estimates') +panel2 = pn.panel(fig2, name='Length and weight distributions') + +def serve_panels(): + # Create links to each panel + home = pn.Column( + pn.pane.Markdown("# Main Page"), + pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", sizing_mode="stretch_width"), + pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", sizing_mode="stretch_width"), + pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", sizing_mode="stretch_width") + ) + + # Serve the home page and individual panels + pn.serve({ + 'Main Page': home, + 'gridded_population_estimates': panel0, + 'alongtrack_population_estimates': panel1, + 'length_weight_distributions': panel2 + }, show=True) +# Run the function to serve panels +serve_panels() + + + +def 
serve_panels(): + panel0.servable(title='Gridded population', location=True) + panel1.servable(title='Alongtrack population') + panel2.servable(title='Length/weight distribution') + pn.serve({'gridded a': panel0, 'fig1': panel1, 'fig2': panel2}, show=True) +serve_panels() + +def serve_panels(): + # Assign titles and make panels servable + panel0.servable(title='Gridded population') + panel1.servable(title='Alongtrack population') + panel2.servable(title='Length/weight distribution') + + # Create a dictionary layout + layout = { + 'Gridded population': panel0, + 'Alongtrack population': panel1, + 'Length/weight distribution': panel2 + } + + # Serve the panels + pn.serve(layout, show=True) + +# Run the function to serve panels +serve_panels() + +layout = pn.Column( + pn.pane.Markdown("# Gridded population", style={'font-size': '20px'}), + panel0, + pn.pane.Markdown("# Alongtrack population", style={'font-size': '20px'}), + panel1, + pn.pane.Markdown("# Length/weight distribution", style={'font-size': '20px'}), + panel2 +) + +def serve_panels(): + # Serve the layout with titles + layout.servable() + pn.serve(layout, show=True) + +# Run the function to serve panels +serve_panels() + +# Create a layout +layout = pn.Tabs(('Plot 1', panel1), ('Plot 2', panel2)) + +# Serve the layout +layout.servable() +pn.serve(layout, show=True) +# Run the server to display panels in separate windows +# Create a layout with tabs +tabs = pn.Tabs(('Alongtrack population', panel1), ('Length/weight distribution', panel2)) +tabs.servable() +pn.serve(tabs, port=5006, show=True) +pn.serve({'Plot 1': panel1, 'Plot 2': panel2}, show=True) + +combined_panel = pn.Column(panel1, panel2) +combined_panel.show() +panel1.show() From 32c0b993a923a47ecee6f80b53aa955fb75060c0 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 19:16:13 -0700 Subject: [PATCH 60/81] Cleaned up `test_workflow` --- echopop/test_workflow.py | 67 +--------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 32105daa..a22fa10b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -128,69 +128,4 @@ def serve_panels(): 'length_weight_distributions': panel2 }, show=True) # Run the function to serve panels -serve_panels() - - - -def serve_panels(): - panel0.servable(title='Gridded population', location=True) - panel1.servable(title='Alongtrack population') - panel2.servable(title='Length/weight distribution') - pn.serve({'gridded a': panel0, 'fig1': panel1, 'fig2': panel2}, show=True) -serve_panels() - -def serve_panels(): - # Assign titles and make panels servable - panel0.servable(title='Gridded population') - panel1.servable(title='Alongtrack population') - panel2.servable(title='Length/weight distribution') - - # Create a dictionary layout - layout = { - 'Gridded population': panel0, - 'Alongtrack population': panel1, - 'Length/weight distribution': panel2 - } - - # Serve the panels - pn.serve(layout, show=True) - -# Run the function to serve panels -serve_panels() - -layout = pn.Column( - pn.pane.Markdown("# Gridded population", style={'font-size': '20px'}), - panel0, - pn.pane.Markdown("# Alongtrack population", style={'font-size': '20px'}), - panel1, - pn.pane.Markdown("# Length/weight distribution", style={'font-size': '20px'}), - panel2 -) - -def serve_panels(): - # Serve the layout with titles - layout.servable() - pn.serve(layout, show=True) - -# Run the function to serve panels -serve_panels() - -# Create a layout 
-layout = pn.Tabs(('Plot 1', panel1), ('Plot 2', panel2)) - -# Serve the layout -layout.servable() -pn.serve(layout, show=True) -# Run the server to display panels in separate windows -# Create a layout with tabs -tabs = pn.Tabs(('Alongtrack population', panel1), ('Length/weight distribution', panel2)) -tabs.servable() -pn.serve(tabs, port=5006, show=True) -pn.serve({'Plot 1': panel1, 'Plot 2': panel2}, show=True) - -combined_panel = pn.Column(panel1, panel2) -combined_panel.show() -panel1.show() - - - +serve_panels() \ No newline at end of file From 65ab70dd361a81056e13bd59dca270b0fc4d39c4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Fri, 16 Aug 2024 20:24:02 -0700 Subject: [PATCH 61/81] Changed dynamic colorrange for some plots --- echopop/live/live_visualizer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 37b67a7b..1ba8dd74 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -208,6 +208,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6, + }, "size": [25, 150] }, "biomass_density": { @@ -216,6 +220,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "plasma", "minimum": 0.0, "cbar_reverse": True, + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6, + }, "size": [25, 150] }, "nasc": { @@ -224,6 +232,10 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, + "color_threshold": { + "minimum": 1e2, + "maximum": 1e4 + }, "size": [25, 150] }, "max_Sv": { From 27ff2d388c4cd41bad87db0d5158262f68a90414 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 10:57:21 -0700 Subject: [PATCH 62/81] Fix to grid plot colormap scaling/range --- echopop/live/live_visualizer.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 1ba8dd74..2b48ae5c 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -62,21 +62,37 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", "colormap": "viridis", + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6 + }, }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", "colormap": "plasma", + "color_threshold": { + "minimum": 1e1, + "maximum": 1e6 + }, }, "biomass": { "name": "Biomass", "units": "kg", "colormap": "cividis", + "color_threshold": { + "minimum": 1e1 * grid_gdf["area"].max(), + "maximum": 1e6 * grid_gdf["area"].max() + }, }, "abundance": { "name": "Abundance", "units": "$\\it{N}$", "colormap": "inferno", + "color_threshold": { + "minimum": 1e1 * grid_gdf["area"].max(), + "maximum": 1e6 * grid_gdf["area"].max() + }, } } @@ -98,8 +114,16 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], newcolors[0, :] = white # ---- Create the new custom colormap custom_cmap = ListedColormap(newcolors) + # ---- Drop "empty" values + sub_grid_gdf = grid_gdf[grid_gdf[var] > 0.0] + if "color_threshold" in VARIABLE_MAP[var].keys(): + min_value = VARIABLE_MAP[var]["color_threshold"]["minimum"] + max_value = VARIABLE_MAP[var]["color_threshold"]["maximum"] + else: + min_value = 
sub_grid_gdf[var].min() + max_value = sub_grid_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=grid_gdf[var].min(), vmax=grid_gdf[var].max()) + norm=plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the polygons with color fills based on the variable (non-zero) grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, norm=norm, @@ -121,8 +145,7 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], ax.set_ylabel(u'Latitude (\u00B0N)') # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, - norm=plt.Normalize(vmin=grid_gdf[var].min(), - vmax=grid_gdf[var].max())) + norm=norm) sm._A = [] # fake up the array of the scalar mappable cbar = fig.colorbar(sm, ax=ax, shrink=0.5) cbar.set_label(f"{var_info['units']}") @@ -232,7 +255,7 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "color_threshold": { + "color_threshold": { "minimum": 1e2, "maximum": 1e4 }, @@ -349,7 +372,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): cbar = fig.colorbar(sm, ax=ax, shrink=0.5, fraction=0.075, pad=0.1) cbar.set_label(f"{var_info['units']}") # ---- Add scalebar - scalebar_length = 250 # Length of scale bar in km + scalebar_length = 100 # Length of scale bar in km scalebar_length_in_degrees = scalebar_length / 111 # Assuming 1 degree = 111 km # ---- Transform scale bar coordinates to axis units # scalebar_x = axis_limits[0]*1.005 + (axis_limits[2]*1.01 - axis_limits[0]*1.005) * 0.1 From 8e21e13d3d90d3291eef4564637fbb1656586359 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 16:53:55 -0700 Subject: [PATCH 63/81] Add dataset validator for biodata --- echopop/live/live_data_loading.py | 73 +++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 96631b35..bde4db7f 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -7,6 +7,7 @@ import numpy as np from datetime import datetime import xarray as xr +import os from .live_core import( LIVE_FILE_FORMAT_MAP, @@ -269,6 +270,17 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Initialize the database file initialize_database(database_root_directory, file_settings) + + # Clean the file names + data_files = [ + re.sub(r'//', r'\\', str(filename)).replace('/', '\\') + if not str(filename).startswith('s3://') + else str(filename) + for filename in data_files + ] + + # Drop incomplete datasets + data_files = validate_complete_biology_dataset(data_files, directory_path, file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( @@ -278,6 +290,67 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Return the valid filenames/paths return valid_files +def validate_complete_biology_dataset(data_files: List[str], + directory_path: str, + file_configuration: dict): + + # Get the biology data file settings + file_settings = file_configuration["input_directories"]["biology"] + + # Get the file-specific settings, datatypes, columns, etc. 
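# A worked example of the haul-number parsing set up below, assuming a format string
# such as "{DATE}_{HAUL}_{FILE_ID}" (the real patterns come from `file_name_formats`
# in the survey-year YAML) and one of the biodata filenames exercised in test_workflow.py:
#
#   import os, re
#   fmt = "{DATE}_{HAUL}_{FILE_ID}"                      # assumed, illustrative format
#   name = os.path.splitext("202407_003_catch_perc.csv")[0]
#   tokens = re.findall(r"\{[^}]+\}|[^_]+", fmt)         # ['{DATE}', '{HAUL}', '{FILE_ID}']
#   haul = name.split("_")[tokens.index("{HAUL}")]       # '003'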
+ # ---- Extract the expected file name ID's + biology_file_ids = file_settings["file_name_formats"] + + # Define helper function for extract haul number from filename strings + def get_file_haul_number(filename, format_string): + # Step 1: Extract the filename from the full path + filename_only = os.path.basename(filename) + + # Remove the file extension from the filename + filename_no_ext = os.path.splitext(filename_only)[0] + + # Split the format string and filename into parts + format_parts = re.findall(r'\{[^}]+\}|[^_]+', format_string) + filename_parts = filename_no_ext.split('_') + + # Find the index of {HAUL} in format_parts + haul_index = format_parts.index('{HAUL}') + + # Extract and return the haul number from filename_parts + if haul_index < len(filename_parts): + return filename_parts[haul_index] + return None + + # Organize dataset by their respective dataset-type + dataset_dict = {key: filter_filenames(directory_path, + ds, + data_files, + file_settings["extension"]) + for key, ds in biology_file_ids.items()} + + # Extract the haul numbers + extracted_hauls = { + key: set(get_file_haul_number(filename, biology_file_ids.get(key, '')) + for filename in filenames) + for key, filenames in dataset_dict.items() + } + + # Find haul numbers that appear in all keys + common_hauls = set.intersection(*extracted_hauls.values()) + + # Filter filenames to keep only those with haul numbers in the common set + filtered_filenames = [ + filename + for key, filenames in dataset_dict.items() + for filename in filenames + if get_file_haul_number(filename, biology_file_ids.get(key, '')) + in common_hauls + ] + + # Return the curated filename list + return filtered_filenames + + def compile_filename_format(file_name_format: str): # Create a copy of `file_name_format` From 0d8e73256d1c084b3ef37e29de112d0ae9f4dc2a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 16:56:30 -0700 Subject: [PATCH 64/81] Apply biodata validator only to biodata... 
--- echopop/live/live_data_loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index bde4db7f..d46cec1e 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -280,7 +280,9 @@ def validate_data_directory(file_configuration: dict, dataset: str, ] # Drop incomplete datasets - data_files = validate_complete_biology_dataset(data_files, directory_path, file_configuration) + if dataset == "biology": + data_files = validate_complete_biology_dataset(data_files, directory_path, + file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) valid_files, file_configuration["database"][dataset] = ( From 3ab32ac0d3e0bccdc0c3c137002c84fd05f75cd4 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 19 Aug 2024 18:48:40 -0700 Subject: [PATCH 65/81] Fix to biodata dataset validator --- echopop/live/live_data_loading.py | 17 ++++++++++++++++- echopop/live/sql_methods.py | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index d46cec1e..0d90e2d3 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -281,7 +281,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Drop incomplete datasets if dataset == "biology": - data_files = validate_complete_biology_dataset(data_files, directory_path, + data_files = validate_complete_biology_dataset(data_files, + directory_path, file_configuration) # Query the SQL database to process only new files (or create the db file in the first place) @@ -349,6 +350,20 @@ def get_file_haul_number(filename, format_string): in common_hauls ] + # Get bad files for DEBUG + non_filtered_filenames = [ + filename + for key, filenames in dataset_dict.items() + for filename in filenames + if get_file_haul_number(filename, biology_file_ids.get(key, '')) + not in common_hauls + ] + + print( + f"The following files are parts of incomplete filesets: " + f"{'\n'.join(non_filtered_filenames)}" + ) + # Return the curated filename list return filtered_filenames diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index a0cf299c..682bef65 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -669,11 +669,13 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List if processed: SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, id_columns=["filepath"]) - else: + elif not current_files.empty: SQL(db_file, "insert", table_name="files_read", dataframe=current_files, id_columns=["filepath"]) # ---- Apply filter by comparing sets and return the output return list(set(files_str) - set(previous_files)), db_file + else: + return None, db_file # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): From 80d2b555fabd740f9d3198a1d824ba09529b1b4f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Tue, 20 Aug 2024 09:03:05 -0700 Subject: [PATCH 66/81] f-string adjustment --- echopop/live/live_data_loading.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 0d90e2d3..06d1a6fc 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -358,10 +358,11 @@ def get_file_haul_number(filename, format_string): if get_file_haul_number(filename, 
biology_file_ids.get(key, '')) not in common_hauls ] - + # ---- Create list + non_filtered_filenames_lst = "\n".join(non_filtered_filenames) print( - f"The following files are parts of incomplete filesets: " - f"{'\n'.join(non_filtered_filenames)}" + f"The following files are parts of incomplete filesets: \n" + f"{non_filtered_filenames_lst}" ) # Return the curated filename list From dd24ebd71888fc3b842e63a4ab8a1ef65d536ea0 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 10:20:27 -0700 Subject: [PATCH 67/81] Change to enable multiple ship data sources --- echopop/live/live_acoustics.py | 13 ++++++++----- echopop/live/live_survey.py | 3 ++- echopop/live/live_visualizer.py | 22 ++++++++++++++++++++-- echopop/live/sql_methods.py | 13 +++++++++++-- 4 files changed, 41 insertions(+), 10 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 44e61ae0..3d8ac59c 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -60,6 +60,8 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object + # ---- Add `ship_id` from the file configuration + prc_nasc_df_filtered.loc[:, "ship_id"] = file_configuration["ship_id"] # ---- Replace NASC `NaN` values with `0.0` prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) # ---- Drop the `frequency_nominal` column and return the output @@ -221,7 +223,7 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # else: nasc_data_df = ( acoustic_data_df - .groupby(["longitude", "latitude", "ping_time", "source"] + gridding_column, + .groupby(["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, observed=False) .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) .reset_index() @@ -239,9 +241,9 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # ---- Reorder columns nasc_data_df = nasc_data_df[ gridding_column - + ["longitude", "latitude", "ping_time", "source", "nasc", "n_layers", "nasc_db", - "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", "evenness", - "occupied_area"] + + ["ship_id", "longitude", "latitude", "ping_time", "source", "nasc", "n_layers", + "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", + "evenness", "occupied_area"] ] # Return the output @@ -261,7 +263,8 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{str(index)}-{df.loc[index, 'source']}" for index in df.index] + key_values = [f"{df.loc[index, "ship_id"]}-{str(index)}-{df.loc[index, 'source']}" + for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index d8470366..a8a2a70e 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -340,7 +340,8 @@ def estimate_population(self, eldp.acoustic_pipeline(self.input["acoustics"], self.input["spatial"]["strata"], self.config, - verbose=verbose) + verbose=verbose, + contrast_columns=["ship_id"]) # --- Validate successful run self.meta["provenance"]["acoustic_population"] = True diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 2b48ae5c..69c33dd5 100644 --- 
a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -304,6 +304,12 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): * (max_size - min_size) + min_size ) + # Define colors for ship_ids (you can customize these colors as needed) + ship_id_colors = { + ship_id: plt.cm.tab10(i) # Use a colormap for distinct colors; adjust as needed + for i, ship_id in enumerate(survey_gdf['ship_id'].unique()) + } + # Create a figure and a 2xn grid of subplots if len(intact_variables) == 4: fig, axes = plt.subplots(2, 2, figsize=(10, 10)) @@ -323,8 +329,18 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): custom_cmap = ListedColormap(newcolors) # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") - ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", - linewidth=0.25, linestyle="-") + # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + # linewidth=0.25, linestyle="-") + handles = [] # List to store legend handles + for ship_id, group in survey_gdf.groupby("ship_id"): + # Sort the group by latitude or longitude + # group = group.sort_values(by=["latitude", "longitude"]) + color = ship_id_colors.get(ship_id, 'gray') + line_handle, = ax.plot(group.geometry.x, group.geometry.y, color=color, + linewidth=0.25, linestyle="-", label=ship_id, zorder=1) + handles.append(line_handle) # Add handle to legend + # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, + # linestyle="-", zorder=1) # ---- Drop "empty" values sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] # ---- Assign color range @@ -348,6 +364,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): max_size=VARIABLE_MAP[var]["size"][1]), cmap=custom_cmap, norm=norm, + zorder = 2 # edgecolor="black", # linewidths=0.1 ) @@ -390,6 +407,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # ---- Add scale text ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, f'{scalebar_length} km', ha='center', va='top', color='black') + # ax.legend(handles=handles, title='Ship ID') # ax.text(scalebar_x + (scalebar_length / 200), # scalebar_y - scalebar_y_offset, diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 682bef65..fc28d2d1 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -744,11 +744,20 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Conditional string formatting helper function + def format_value(value): + if isinstance(value, str): + return f"'{value.replace("'", "''")}'" + return str(value) # ---- Create conditional string conditional_str = " | ".join( - [" & ".join([f"{col} = {val}" for col, val in row.items()]) + [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) for _, row in unique_keys_df.iterrows()] - ) + ) + # conditional_str = " | ".join( + # [" & ".join([f"{col} = {val}" for col, val in row.items()]) + # for _, row in unique_keys_df.iterrows()] + # ) # conditional_str = ( # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" # for col in unique_keys_df.columns]) From 3c4830ac22ee095921d5476deb300795f37f375e Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 11:27:50 -0700 Subject: [PATCH 68/81] Fix 
to cases where lon/lat/ping_time were NaN/NaT --- echopop/live/live_acoustics.py | 2 ++ echopop/live/live_data_processing.py | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 3d8ac59c..f59b3d13 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -45,6 +45,8 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( survey_data[survey_data["frequency_nominal"] == transmit_settings["frequency"]] + # ---- Drop NaN/NaT values from longitude/latitude/ping_time + .dropna(subset=["longitude", "latitude", "ping_time"]) ) # Get grid coordinates diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index 8673bd53..a2dcaa46 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -86,7 +86,7 @@ def acoustic_pipeline(acoustic_dict: dict, if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column + contrast_columns) + nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) # ---- Compute the number densities (animals nmi^-2) nasc_biology["number_density"] = ( nasc_biology["nasc"] @@ -101,7 +101,7 @@ def acoustic_pipeline(acoustic_dict: dict, if weight_spatial_averages is not None: # Merge average weights with number density estimates nasc_biology = nasc_biology.merge(weight_spatial_averages, - on=spatial_column + contrast_columns) + on=spatial_column) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -109,7 +109,8 @@ def acoustic_pipeline(acoustic_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + if (all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) + and not nasc_biology.empty): sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", columns=["number_density", "biomass_density"], unique_columns=["id"]) From 4bce56044815049fc6e51b2a6b7e943ab1dd22ed Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 12:05:05 -0700 Subject: [PATCH 69/81] f-string fix for sql_methods --- echopop/live/sql_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index fc28d2d1..9b6354e6 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -745,10 +745,10 @@ def query_dataset(db_file: str, # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) # ---- Conditional string formatting helper function - def format_value(value): - if isinstance(value, str): - return f"'{value.replace("'", "''")}'" - return str(value) + def format_value(x): + if isinstance(x, str): + return "'{}'".format(x.replace("'", "''")) + return str(x) # ---- Create conditional string conditional_str = " | ".join( [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) From 50937f123806ad337959c8a5630356737cc52b40 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 12:24:05 -0700 Subject: [PATCH 70/81] Fixed `ship_id` f-string issue --- echopop/live/live_acoustics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index f59b3d13..2a9347a0 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -265,7 +265,7 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{df.loc[index, "ship_id"]}-{str(index)}-{df.loc[index, 'source']}" + key_values = [f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" for index in df.index] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values From f324647f5e8ce8307e2f498875bb2113453f9c2f Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:13:16 -0700 Subject: [PATCH 71/81] Fixes to odd SQL table column shuffling --- echopop/live/live_biology.py | 5 +- echopop/live/live_core.py | 192 +++++++++++++++++++++++++++++++++++ echopop/live/live_survey.py | 3 +- echopop/live/sql_methods.py | 13 ++- 4 files changed, 207 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 99264e0f..5e70d92b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -195,9 +195,10 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, insertion_df = sigma_bs_df.copy() # ---- Create SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, - primary_keys=["id"]) + primary_keys=key_list+["id"]) # ---- Populate table - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df, + id_columns=key_list+["id"]) else: # ---- Get previous values in the table table_df = SQL(biology_db, "select", table_name="sigma_bs_mean_df") diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 256b9f27..388a8240 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -32,6 +32,198 @@ }, } +# Required data configuration YAML structure +LIVE_CONFIG_INIT_MODEL = { + "required_keys": ["acoustics", "biology", "geospatial"], + "optional_keys": [], + "keys": { + "acoustics": { + "required_keys": ["transmit", "TS_length_regression_parameters"], + "optional_keys": [], + "keys": { + "transmit": { + "required_keys": ["frequency", "units"], + "optional_keys": [], + "keys": { + "frequency": float, + "units": ["Hz", "kHz"], + }, + }, + "TS_length_regression_parameters": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": { + "required_keys": ["number_code", "TS_L_slope", "TS_L_intercept", + "length_units"], + "optional_keys": ["character_code"], + "keys": { + "number_code": int, + "characeter_code": str, + "TS_L_slope": float, + "TS_L_intercept": float, + "length_units": ["mm", "cm", "m"], + }, + }, + }, + }, + }, + }, + "biology": { + "required_keys": ["length_distribution", "catch"], + "optional_keys": ["stations"], + "keys": { + "length_distribution": { + "required_keys": ["bins"], + "optional_keys": [], + "keys": { + "bins": [float, int], + }, + }, + "stations": { + "required_keys": ["separate_stations", "station_id"], + "optional_keys": [], + "keys": { + "separate_stations": bool, + "station_id": [str], + }, + }, + "catch": { + "required_keys": ["partition"], + "optional_keys": [], + "keys": { + "partition": str, + }, + }, + }, + }, + "geospatial": { + "required_keys": ["projection", "link_biology_acoustics"], + "optional_keys": ["inpfc", "griddify"], + 
"keys": { + "inpfc": { + "required_keys": ["latitude_max", "stratum_names"], + "optional_keys": [], + "keys": { + "latitude_max": [float], + "stratum_names": [int, str], + }, + }, + "griddify": { + "required_keys": ["bounds", "grid_resolution"], + "optional_keys": [], + "keys": { + "bounds": { + "required_keys": [("latitude", "longitude"), ("x", "y")], + "optional_keys": [], + "keys": { + "latitude": [float], + "longitude": [float], + "x": [float], + "y": [float] + }, + }, + "grid_resolution": { + "required_keys":[("latitude_distance", "longitude_distance"), + ("x_distance", "y_distance")], + "optional_keys": [], + "keys": { + "longitude_distance": float, + "latitude_distance": float, + "x_distance": float, + "y_distnace": float, + } + } + }, + }, + "link_biology_acoustics": ["closest_haul", "global", "INPFC", "weighted_haul"], + "projection": str, + }, + }, + }, +} + +# Required data configuration YAML structure +LIVE_CONFIG_DATA_MODEL = { + "required_keys": ["ship_id", "survey_year", "database_directory", "input_directories"], + "optional_keys": ["species", "data_root_dir"], + "keys": { + "data_root_dir": str, + "database_directory": str, + "input_directories": { + "required_keys": ["acoustics", "biology"], + "optional_keys": ["coastline", "grid"], + "keys": { + "acoustics": { + "required_keys": ["database_name", "directory", "extension"], + "optional_keys": [], + "keys": { + "directory": str, + "database_name": str, + "extension": ["zarr"], + }, + }, + "biology": { + "required_keys": ["database_name", "directory", "extension", "file_index", + "file_ids", "file_name_formats"], + "optional_keys": [], + "keys": { + "directory": str, + "database_name": str, + "extension": ["csv"], + "file_name_formats": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": str, + }, + }, + "file_ids": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": str, + }, + }, + "file_index": { + "required_keys": ["*"], + "optional_keys": [], + "keys": { + "*": [str], + }, + }, + }, + }, + "coastline": { + "required_keys": ["directory", "coastline_name"], + "optional_keys": [], + "keys": { + "directory": str, + "coastline_name": str, + }, + }, + "grid": { + "required_keys": ["database_name"], + "optional_keys": [], + "keys": { + "database_name": str, + }, + }, + }, + }, + "ship_id": [str, int], + "species": { + "required_keys": [], + "optional_keys": ["text_code", "number_code"], + "keys": { + "text_code": str, + "number_code": int, + }, + }, + "survey_year": int, + }, +} + # TODO: Update structure with additional information (as needed) # TODO: Documentation LIVE_INPUT_FILE_CONFIG_MAP = { diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index a8a2a70e..51cc4ba8 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -60,8 +60,7 @@ def __init__( # Loading the configuration settings and definitions that are used to # initialize the Survey class object - self.config = eldl.live_configuration(Path(live_init_config_path), - Path(live_file_config_path)) + self.config = eldl.live_configuration(live_init_config_path, live_file_config_path) # ---- Initialize config key for database files self.config.update( {"database": {key: None for key in self.config["input_directories"].keys()}} diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 9b6354e6..0e5f6a97 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -116,16 +116,23 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, 
data conflict_columns (list): List of column names to use for conflict resolution. """ + # Create 'inspector' for the db file + inspector = inspect(connection) + # ---- Get the column names from the db file + table_columns = [col['name'] for col in inspector.get_columns(table_name)] + # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' if "*" in columns: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - columns = [col['name'] for col in inspector.get_columns(table_name)] + columns = table_columns # ---- If not a List elif not isinstance(columns, list): columns = [columns] + # ---- Match column indexing with original table + columns = [col for col in table_columns if col in columns] # ---- Prepare the columns as a string of column names column_names = ", ".join(columns) @@ -136,6 +143,8 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) + # ---- Match column indexing with original table + dataframe = dataframe[columns] # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] @@ -157,7 +166,7 @@ def format_value(x): # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) - data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) + data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" From f0f8001cb2f2a8a5765d31c4c56fe67c2fd88698 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:37:55 -0700 Subject: [PATCH 72/81] Minor improvements to visualizer code --- echopop/live/live_visualizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 69c33dd5..4c59d975 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -7,6 +7,7 @@ import geopandas as gpd from typing import Union, Optional from pathlib import Path +import matplotlib.gridspec as gridspec def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], projection: str, @@ -317,6 +318,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): fig, axes = plt.subplots(1, 3, figsize=(10, 10)) elif len(intact_variables) == 2: fig, axes = plt.subplots(1, 1, figsize=(10, 10)) + plt.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.0, hspace=0.0) # Iterate through and plot all subplots for ax, var in zip(axes.flat, intact_variables): From 5fcc0c9ceeefdfefd77f1e387579cb90af44ffad Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 17:39:49 -0700 Subject: [PATCH 73/81] New configuration file validator --- echopop/live/live_data_loading.py | 220 +++++++++++++++++++++--------- 1 file changed, 154 insertions(+), 66 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 06d1a6fc..80dee085 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -8,11 +8,14 @@ from datetime import datetime import xarray as xr import os +import copy from .live_core import( LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, - SPATIAL_CONFIG_MAP + SPATIAL_CONFIG_MAP, + LIVE_CONFIG_INIT_MODEL, + LIVE_CONFIG_DATA_MODEL ) from 
.live_spatial_methods import create_inpfc_strata @@ -42,8 +45,11 @@ def live_configuration(live_init_config_path: Union[str, Path], # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) + # -------- Validate + init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) + file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL) # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary @@ -456,71 +462,6 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) -# def load_biology_data(file_configuration: dict): - -# # Get the acoustic file settings and root directory -# # ---- File settings -# file_settings = file_configuration["input_directories"]["biology"] -# # ---- Root directory -# root_directory = file_configuration["data_root_dir"] - -# # Get and validate the acoustic data directory and files -# biology_files = validate_data_directory(root_directory, file_settings) - -# # Query `biology.db` to process only new files (or create the db file in the first place) -# # SQL(biology_db, "drop", table_name="files_read") -# new_biology_files, file_configuration["database"]["biology"] = ( -# query_processed_files(root_directory, file_settings, biology_files) -# ) - -# # Get the file-specific settings, datatypes, columns, etc. -# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# # ---- Extract the expected file name ID's -# biology_file_ids = file_settings["file_name_formats"] -# # ---- Extract all of the file ids -# biology_config_ids = list(biology_file_ids.keys()) -# # ---- Initialize the dictionary that will define this key in the `input` attribute -# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} -# # ---- Create filepath object -# directory_path = Path(file_configuration["data_root_dir"]) / file_settings["directory"] - -# # Add SQL file to dict -# file_configuration["database"]["biology"] = ( -# Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] -# ) - -# # Iterate through the different biology datasets and read them in -# for dataset in list(biology_file_ids.keys()): -# # ---- Get dataset-specific file lists -# dataset_files = filter_filenames(directory_path, -# file_settings["file_name_formats"][dataset], -# new_biology_files, -# file_settings["extension"]) -# # ---- If there are dataset files available -# if dataset_files: -# # ---- Read in validated biology data -# dataframe_list = [read_biology_csv(Path(file), -# file_settings["file_name_formats"][dataset], -# biology_config_map[dataset]) -# for file in dataset_files] -# # ---- Concatenate the dataset -# dataframe_combined = pd.concat(dataframe_list, ignore_index=True) -# # ---- Lower-case sex -# if "sex" in dataframe_combined.columns: -# dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() -# # ---- Lower-case trawl partition type -# if "trawl_partition" in dataframe_combined.columns: -# dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() -# # ---- Reformat datetime column -# if "datetime" in dataframe_combined.columns: -# 
dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) -# # ---- Add to the data dictionary -# biology_output[f"{dataset}_df"] = dataframe_combined - -# # Pre-process and return the results -# return preprocess_biology_data(biology_output, file_configuration) - def validate_hauls_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -705,3 +646,150 @@ def validate_spatial_config(spatial_config: dict): validate_inpfc_config(spatial_config, link_method) elif link_method != "global": validate_hauls_config(spatial_config, link_method) + +def validate_live_config(config, reference_model): + """Validate configuration inputs""" + + # Recursive function for validating entire nested dictionary + def validate_keys(config, model, path=""): + + # Get the required/optional/actual keys + # ---- Keys that are required by the software + required_keys = model.get("required_keys", []) + # ---- Keys that are optionally incorporated into the software + optional_keys = model.get("optional_keys", []) + # ---- Navigate the nested branches + keys = model.get("keys", {}) + + # General helper functions + # ---- + def get_keys_from_tuples(tuples): + """Parse key names from tuples""" + return {key for group in tuples if isinstance(group, tuple) for key in group} + # ---- + def find_missing_keys(required_keys, keys_to_check): + """Find any missing keys""" + all_required_keys = get_keys_from_tuples(required_keys) + valid_keys_in_tuples = set() + for group in required_keys: + if isinstance(group, tuple): + if any(key in keys_to_check for key in group): + valid_keys_in_tuples.update(group) + missing_keys = [key for key in valid_keys_in_tuples if key not in keys_to_check] + unexpected_keys = [key for key in keys_to_check if key not in all_required_keys] + return missing_keys, unexpected_keys + # ---- + def check_for_missing_keys(required_keys, config_keys, path): + """Check whether any required keys are missing""" + missing_required = [] + for key in required_keys: + if isinstance(key, tuple): + missing_keys, unexpected_keys_for_keys = find_missing_keys(required_keys, + config_keys) + if missing_keys: + raise ValueError( + f"Missing required key(s): {', '.join(missing_keys)} at {path}" + ) + return unexpected_keys_for_keys + elif key not in config_keys and key != "*": + missing_required.append(key) + if missing_required: + raise ValueError( + f"Missing required key(s): {', '.join(missing_required)} at {path}" + ) + return [] + # ---- + def check_for_unexpected_keys(config_keys, required_keys): + """Check for unexpected keys""" + unexpected_keys = [] + for key in config_keys: + if (key not in required_keys + and key not in optional_keys + and "*" not in required_keys): + if not any(key in group for group in required_keys if isinstance(group, tuple)): + unexpected_keys.append(key) + return unexpected_keys + + # Top-level validation + if path == "": + missing_primary_keys = [key for key in required_keys + if key != "*" and key not in config] + if missing_primary_keys: + raise ValueError(f"Missing primary key(s): {', '.join(missing_primary_keys)}") + unexpected_primary_keys = [key for key in config + if key not in required_keys + and key not in optional_keys + and "*" not in required_keys] + # ---- Raise error + if unexpected_primary_keys: + raise ValueError( + f"Unexpected primary key(s) found: {', '.join(unexpected_primary_keys)}" + ) + # Nested validation + else: + config_keys = config.keys() + unexpected_keys = check_for_missing_keys(required_keys, 
config_keys, path) + unexpected_keys.extend(check_for_unexpected_keys(config_keys, required_keys)) + # ---- Raise error + if unexpected_keys: + raise ValueError(f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path}") + + # Recursively validate nested dictionaries and lists + for key, sub_model in keys.items(): + if key == "*" and isinstance(sub_model, dict): + for sub_key in config: + validate_keys(config[sub_key], + sub_model, path=f"{path}.{sub_key}" if path else sub_key) + elif key == "*" and isinstance(sub_model, list): + for sub_key in config: + validate_list(config[sub_key], sub_model, key, path) + elif key == "*": + for sub_key in config: + validate_type(config[sub_key], sub_model, key, path) + elif key in config: + if isinstance(sub_model, dict): + validate_keys(config[key], sub_model, path=f"{path}.{key}" if path else key) + elif isinstance(sub_model, list): + validate_list(config[key], sub_model, key, path) + else: + validate_type(config[key], sub_model, key, path) + + # Additional helper functions + # ---- + def validate_list(config_value, allowed_types, key, path): + """Validate configuration with model that is formatted as a list""" + if all(isinstance(item, (str, int, float)) for item in allowed_types): + if config_value not in allowed_types: + raise ValueError( + f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + ) + elif not isinstance(config_value, list): + if type(config_value) not in allowed_types: + raise ValueError( + f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + ) + else: + if isinstance(config_value, list): + for item in config_value: + if not any(isinstance(item, t) for t in allowed_types): + raise ValueError( + f"Invalid type for items in list '{key}' at {path}. Expected one of: " + f"{allowed_types}" + ) + else: + raise ValueError( + f"Invalid type for key '{key}' at {path}. Expected a list of: {allowed_types}" + ) + # ---- + def validate_type(config_value, expected_type, key, path): + """Validate configuration with model that is at the furthest point along a branch""" + if not isinstance(config_value, expected_type): + raise ValueError( + f"Invalid type for key '{key}' at {path}. 
Expected type: {expected_type}" + ) + + # Validate all branches within the configuration dictionary + validate_keys(config, reference_model) + + # Return + return config From 7db976454f930b71e8471cb3f092d169d698157a Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 21 Aug 2024 18:25:27 -0700 Subject: [PATCH 74/81] Data reading validators --- config_files/live_survey_year_2019_config.yml | 4 +- echopop/live/live_data_loading.py | 176 ++++++++---- echopop/test_workflow.py | 260 +++++++++++++++++- 3 files changed, 382 insertions(+), 58 deletions(-) diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index e52db83c..fe8bb8b7 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -7,7 +7,8 @@ ############################################################################## # Parameters -survey_year: 2019 # survey year being considered +ship_id: R/V Shimada +survey_year: 2024 # survey year being considered species: text_code: pacific_hake # target species for the survey year -- species name number_code: 22500 # target species for the survey year -- numeric code @@ -26,6 +27,7 @@ input_directories: extension: zarr biology: directory: biology/ + # directory: s3://sh2407-upload/data/Echopop-biology/ database_name: biology.db extension: csv file_name_formats: diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 80dee085..f096ebed 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -9,6 +9,8 @@ import xarray as xr import os import copy +import boto3 +from botocore.exceptions import ClientError from .live_core import( LIVE_FILE_FORMAT_MAP, @@ -218,66 +220,65 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - # Return a Tuple return zarr_data_df_filtered, data_units -# TODO: Documentation -def validate_data_directory(file_configuration: dict, dataset: str, - input_filenames: Optional[list] = None) -> List[Path]: - - # Get the dataset file settings - file_settings = file_configuration["input_directories"][dataset] +def construct_directorypath(file_configuration: dict, file_settings: dict): + """Construct the root directory path.""" - # Get the acoustic file settings and root directory - # ---- Root directory - if "data_root_dir" in file_configuration.keys(): - # root_directory = Path(file_configuration["data_root_dir"]) + # Get the general root_directory, if present + if "data_root_dir" in file_configuration: root_directory = file_configuration["data_root_dir"] - else: - # root_directory = Path() + else: root_directory = "" - # ---- File folder - # data_directory = Path(file_settings["directory"]) + + # Get the local directory (or this may be the root directory depending on the config) data_directory = file_settings["directory"] - # ---- Createa directory path - # directory_path = root_directory / data_directory + + # Return the directory path if root_directory != "": - directory_path = "/".join([root_directory, data_directory]) + return "/".join([root_directory, data_directory]) else: - directory_path = data_directory + return data_directory - # Validate filepath, columns, datatypes - # ---- Error evaluation (if applicable) - # if not directory_path.exists(): - # raise FileNotFoundError( - # f"The acoustic data directory [{directory_path}] does not exist." 
- # ) +def is_s3_path(path): + """Check if a path is an S3 path.""" + return path.startswith("s3://") - # Validate that files even exist - # ---- List available *.zarr files - # data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) - # ---- Error evaluation (if applicable) - # if not data_files: - # raise FileNotFoundError( - # f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - # ) - - # Check and format specific input filenames - if isinstance(input_filenames, list): - # data_files = [directory_path / filename for filename in input_filenames] - data_files = ["/".join([directory_path, filename]) for filename in input_filenames] - # ---- Raise Error - elif input_filenames is not None: +# TODO: Documentation +def validate_data_directory(file_configuration: dict, dataset: str, + input_filenames: Optional[list] = None) -> List[Path]: + + # Get the dataset file settings + file_settings = file_configuration["input_directories"][dataset] + + # Get the data file settings and directorypath + directory_path = construct_directorypath(file_configuration, file_settings) + + # Validate `input_filenames` input + if input_filenames is not None and not isinstance(input_filenames, list): raise TypeError( "Data loading argument `input_filenames` must be a list." ) - else: - data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) - - # Database root directory - database_root_directory = file_configuration["database_directory"] - - # Initialize the database file - initialize_database(database_root_directory, file_settings) + + # Format data filenames + if input_filenames is not None: + data_files = ["/".join([directory_path, filename]) for filename in input_filenames] - # Clean the file names + # Validate directories and format filepath names + # ---- S3 bucket + if is_s3_path(directory_path): + # ---- Validate + validate_s3_path(directory_path, file_configuration["storage_options"]) + # ---- Format data files + if input_filenames is None: + data_files = [] + # ---- Local + else: + # ---- Validate + validate_local_path(directory_path, file_settings) + # ---- Format data files + if input_filenames is None: + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) + + # Clean the filenames data_files = [ re.sub(r'//', r'\\', str(filename)).replace('/', '\\') if not str(filename).startswith('s3://') @@ -285,6 +286,12 @@ def validate_data_directory(file_configuration: dict, dataset: str, for filename in data_files ] + # Database root directory + database_root_directory = file_configuration["database_directory"] + + # Initialize the database file + initialize_database(database_root_directory, file_settings) + # Drop incomplete datasets if dataset == "biology": data_files = validate_complete_biology_dataset(data_files, @@ -299,6 +306,79 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Return the valid filenames/paths return valid_files +def validate_s3_path(s3_path: str, cloud_credentials: dict): + """Check if (parts of) S3 path exists.""" + + # Redundant validation that S3 object validation is appropriate + if not is_s3_path(s3_path): + raise ValueError("The path is not an S3 path.") + + # Validate credentials + if not all([True if param in cloud_credentials.keys() else False + for param in ["key", "secret"]]): + # ---- Find missing credentials + missing_creds = set(["key", "secret"]) - set(cloud_credentials) + # ---- Format into string + missing_creds_str = ", 
".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) + # ---- Raise Error + raise PermissionError( + f"Required S3 credentials missing: {missing_creds_str}." + ) + + # Remove the s3:// prefix + s3_path_reduced = s3_path[len("s3://"):] + + # Split into bucket and key + parts = s3_path_reduced.split("/", 1) + if len(parts) < 2: + raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + + # Get bucket name and directory keys + bucket_name, directory = parts + + # Initialize the S3 client + s3_client = boto3.client("s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"]) + + # Check if the bucket exists + try: + s3_client.head_bucket(Bucket=bucket_name) + except ClientError as e: + raise FileNotFoundError( + f"S3 bucket '{bucket_name}' does not exist or you do not have access." + ) + + # Check if the S3 directory exists + try: + # ---- Ping a response from the bucket + response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) + # ---- Check for `Contents` + if "Contents" not in response: + raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") + except ClientError as e: + # --- Raise Error and propagate it upwards + raise e + +def validate_local_path(directory_path: str, file_settings: dict): + + # Validate filepath + # ---- Error evaluation (if applicable) + if not Path(directory_path).exists(): + raise FileNotFoundError( + f"The data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available files of target extension + data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" 
+ ) + + def validate_complete_biology_dataset(data_files: List[str], directory_path: str, file_configuration: dict): diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index a22fa10b..8e15088c 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -2,6 +2,24 @@ from echopop.live.sql_methods import SQL import echopop.live.live_visualizer as elv from pathlib import Path +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +) +import boto3 +from botocore.exceptions import NoCredentialsError, ClientError +import pandas as pd +import numpy as np +from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +from echopop.live.live_biology import compute_sigma_bs +from echopop.acoustics import ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce +from echopop.live.live_data_loading import filter_filenames, read_biology_csv + #################################################################################################### # TEST: Set up `LiveSurvey` object # NOTE: General initialization parameter configuration @@ -9,10 +27,174 @@ # NOTE: File configuration live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" # NOTE: Create object -realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) # NOTE: String-representation via `LiveSurvey.__repr__`: # NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey +self = realtime_survey +file_configuration = self.config + +input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] +realtime_survey.config["input_directories"]["biology"]["directory"] = "s3://sh2407-upload/data/Echopop-biology" + +survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", table_name="survey_data_df") + + +del realtime_survey.config["data_root_dir"] +self = realtime_survey + +# realtime_survey.config["storage_options"] = aws_credentials +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +realtime_survey.load_biology_data(input_filenames=input_filenames) +realtime_survey.input["biology"] +def is_s3_path(path): + """Check if a path is an S3 path.""" + return path.startswith("s3://") + +dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] +s3_path = dataset_directory +is_s3_path(dataset_directory) + +cloud_credentials = aws_credentials +cloud_credentials = {} +def validate_s3_path(s3_path: str, cloud_credentials: dict): + """Check if (parts of) S3 path exists.""" + + # Redundant validation that S3 object validation is appropriate + if not is_s3_path(s3_path): + raise ValueError("The path is not an S3 path.") + + # Validate credentials + if not all([True if param in cloud_credentials.keys() else False + for param in ["key", "secret"]]): + # ---- Find missing credentials + missing_creds = set(["key", "secret"]) - set(cloud_credentials) + 
# ---- Format into string + missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) + # ---- Raise Error + raise PermissionError( + f"Required S3 credentials missing: {missing_creds_str}." + ) + + # Remove the s3:// prefix + s3_path_reduced = s3_path[len("s3://"):] + + # Split into bucket and key + parts = s3_path_reduced.split("/", 1) + if len(parts) < 2: + raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + + # Get bucket name and directory keys + bucket_name, directory = parts + + # Initialize the S3 client + s3_client = boto3.client("s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"]) + + # Check if the bucket exists + try: + s3_client.head_bucket(Bucket=bucket_name) + except ClientError as e: + raise FileNotFoundError( + f"S3 bucket '{bucket_name}' does not exist or you do not have access." + ) + + # Check if the S3 directory exists + try: + # ---- Ping a response from the bucket + response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) + # ---- Check for `Contents` + if "Contents" not in response: + raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") + except ClientError as e: + # --- Raise Error and propagate it upwards + raise e + +validate_s3_path(s3_path, cloud_credentials) + +import pandas as pd + +self = realtime_survey +biology_files = self.meta["provenance"]["biology_files_read"] +file_configuration = self.config +dataset = "biology" + +# Get the dataset file settings +file_settings = file_configuration["input_directories"][dataset] + +def construct_directorypath(file_configuration: dict, file_settings: dict): + """Construct the root directory path.""" + + # Get the general root_directory, if present + if "data_root_dir" in file_configuration: + root_directory = file_configuration["data_root_dir"] + else: + root_directory = "" + + # Get the local directory (or this may be the root directory depending on the config) + data_directory = file_settings["directory"] + + # Return the directory path + if root_directory != "": + return "/".join([root_directory, data_directory]) + else: + return data_directory + +directory_path = construct_directorypath(file_configuration, file_settings) + +def validate_local_path(directory_path: str): + + # Validate filepath + # ---- Error evaluation (if applicable) + if not Path(directory_path).exists(): + raise FileNotFoundError( + f"The acoustic data directory [{directory_path}] does not exist." + ) + + # Validate that files even exist + # ---- List available files of target extension + data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) + # ---- Error evaluation (if applicable) + if not data_files: + raise FileNotFoundError( + f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" + ) + + + + +# Get the biology data file settings +file_settings = file_configuration["input_directories"]["biology"] + +# Get the file-specific settings, datatypes, columns, etc. 
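+# NOTE: Hedged sketch (illustrative only) of how the path helpers defined above in this script
+# are expected to fit together; `cloud_credentials` is assumed to carry "key"/"secret" entries
+# whenever the configured biology directory points at S3:
+# biology_directory = construct_directorypath(file_configuration, file_settings)
+# if is_s3_path(biology_directory):
+#     validate_s3_path(biology_directory, cloud_credentials)
+# else:
+#     validate_local_path(biology_directory)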
+# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# ---- Extract the expected file name ID's +biology_file_ids = file_settings["file_name_formats"] +# ---- Extract all of the file ids +biology_config_ids = list(biology_file_ids.keys()) +# ---- Initialize the dictionary that will define this key in the `input` attribute +biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + +# Initialize a session with AWS credentials +s3_client = boto3.client( + 's3', + aws_access_key_id=aws_credentials["key"], + aws_secret_access_key=aws_credentials["secret"] +) +response = s3_client.list_buckets() +buckets = response.get('Buckets', []) +for bucket in buckets: + print(f"Bucket Name: {bucket['Name']}") +s3_client.head_bucket(Bucket="sh2407-upload") +realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) +realtime_survey.config["ship_id"] +grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") +grid_data[grid_data.abundance > 0] +bucket = boto3.client("s3", region_name=None) +bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] + "/") +bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], Prefix=path, MaxKeys=1) #################################################################################################### # TEST: TRIGGER --> NEW ACOUSTIC DATA # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) @@ -25,11 +207,13 @@ realtime_survey.estimate_population(working_dataset="acoustic") # NOTE: String-representation via `LiveSurvey.__repr__`: # NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey +realtime_survey.input["acoustics"] #################################################################################################### # TEST: TRIGGER --> NEW BIOLOGY DATA # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) realtime_survey.load_biology_data() +len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) +realtime_survey.meta["provenance"]["biology_files_checkpoint3"] # NOTE: Process new biological data # NOTE: This will update linked database tables realtime_survey.process_biology_data() @@ -50,15 +234,18 @@ # ---- ACOUSTIC SQL(db_file=realtime_survey.config["database"]["acoustics"], command="select", table_name="files_processed") +dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", table_name="files_processed") # ---- BIOLOGICAL -SQL(db_file=realtime_survey.config["database"]["biology"], - command="select", table_name="files_processed") +SQL(db_file=realtime_survey.config["database"]["biology"],command="select", table_name="files_processed") +dat.loc[0:, "filepath"][105] #################################################################################################### # TEST: `LiveSurvey` --[(key) SQL tables]--> Users # !!! 
The SQL functions will fail if the tables have not yet been created/initialized # ---- ACOUSTICS # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df").latitude.max() +realtime_survey.input["spatial"]["strata"] # NOTE: Along-track acoustically-derived number/biomass densities and NASC SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") # ---- BIOLOGICAL @@ -76,7 +263,59 @@ import panel as pn # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -grid_db = Path(realtime_survey.config["database"]["grid"]) +# grid_db = Path(realtime_survey.config["database"]["grid"]) +grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") +dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +dat +dat1 = SQL(grid_db, "select", table_name="grid_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") + +sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" +# Create the engine +engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") +# Create the SQL database connection and send the script +with engine.connect() as connection: + table = connection.execute(text(sql_cmd)) + +data = table.fetchall() +dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] +dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] +dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) +SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", dataframe=dd) +SQL("C:/Users/Brandyn/Downloads/biology.db", "map") +SQL(biology_db, "drop", table_name="sigma_bs_mean_df") +SQL(biology_db, "select", table_name="sigma_bs_mean_df") +dd.loc[:, "haul_num"] = pd.Series([101, 103]) +dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] +SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) +SQL(biology_db, "select", table_name="sigma_bs_mean_df") +import numpy as np; import pandas as pd +SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") +sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="sigma_bs_mean_df") +table_df = SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +sigma_bs_df = table_df +# ---- Check the table keys +table_keys = np.unique(table_df["id"]).tolist() +# ---- Get unique values +current_keys = np.unique(sigma_bs_df["id"]).tolist() +# ---- Get INSERTION keys +insertion_keys = list(set(current_keys).difference(set(table_keys))) +# ---- Get UPDATE keys +update_keys = list(set(current_keys).intersection(set(table_keys))) +insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] +insertion_df.loc[0, "species_id"] = 22500 +insertion_df.loc[0, "stratum"] = 5 +insertion_df.loc[0, "haul_num"] = 100 +insertion_df.loc[0, "sigma_bs"] = 1e-10 +insertion_df.loc[0, "sigma_bs_count"] = 100 +insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 +insertion_df.loc[0, "id"] = f"{(1,1,1)}" +SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", + 
dataframe=insertion_df) +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +dat1[dat1.abundance > 0] +dat[dat.number_density > 0] coast_db = grid_db biology_db = Path(realtime_survey.config["database"]["biology"]) projection = realtime_survey.config["geospatial"]["projection"] @@ -91,10 +330,13 @@ def plt_to_pn(fig): panel.show() # OR panel.servable() if you want to serve it in a Panel server # ---- PLOT GRID fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig.show() plt_to_pn(fig) # ---- PLOT TRACK -fig1 = elv.plot_livesurvey_track(survey_data_db, projection, coast_db) -plt_to_pn(fig) +from echopop.live.live_visualizer import plot_livesurvey_track +fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +fig1.show() +plt_to_pn(fig1) # ---- PLOT DISTRIBUTIONS weight_table = SQL(biology_db, "select", table_name="length_weight_df") @@ -105,7 +347,7 @@ def plt_to_pn(fig): length_table = SQL(biology_db, "select", table_name="length_df") fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) -plt_to_pn(fig) +plt_to_pn(fig2) ### MULTIPANEL panel0 = pn.panel(fig, name='Gridded population estimates') panel1 = pn.panel(fig1, name='Alongtrack population estimates') From cef3036ffecd4a05fa666023df0c8d05b0cacc25 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 26 Aug 2024 10:29:45 -0700 Subject: [PATCH 75/81] Clarified config validation error messages --- echopop/live/live_data_loading.py | 48 ++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index f096ebed..3018604e 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -48,10 +48,12 @@ def live_configuration(live_init_config_path: Union[str, Path], # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) # -------- Validate - init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL) + init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, + live_init_config_path) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL) + file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, + live_file_config_path) # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary @@ -727,8 +729,12 @@ def validate_spatial_config(spatial_config: dict): elif link_method != "global": validate_hauls_config(spatial_config, link_method) -def validate_live_config(config, reference_model): +def validate_live_config(config: dict, reference_model: dict, filename: Union[str, Path]): """Validate configuration inputs""" + + # Convert to string if Path + if isinstance(filename, Path): + filename = str(filename) # Recursive function for validating entire nested dictionary def validate_keys(config, model, path=""): @@ -768,14 +774,17 @@ def check_for_missing_keys(required_keys, config_keys, path): config_keys) if missing_keys: raise ValueError( - f"Missing required key(s): {', '.join(missing_keys)} at {path}" + f"Missing required configuration key(s): " + f"{', '.join(missing_keys)} at {path} in 
configuration file " + f"'{filename}'." ) return unexpected_keys_for_keys elif key not in config_keys and key != "*": missing_required.append(key) if missing_required: raise ValueError( - f"Missing required key(s): {', '.join(missing_required)} at {path}" + f"Missing required configuration key(s): {', '.join(missing_required)} at " + f"{path} in configuration file '{filename}'." ) return [] # ---- @@ -795,7 +804,10 @@ def check_for_unexpected_keys(config_keys, required_keys): missing_primary_keys = [key for key in required_keys if key != "*" and key not in config] if missing_primary_keys: - raise ValueError(f"Missing primary key(s): {', '.join(missing_primary_keys)}") + raise ValueError( + f"Missing primary configuration key(s): {', '.join(missing_primary_keys)} in " + f"configuration file '{filename}'." + ) unexpected_primary_keys = [key for key in config if key not in required_keys and key not in optional_keys @@ -803,7 +815,8 @@ def check_for_unexpected_keys(config_keys, required_keys): # ---- Raise error if unexpected_primary_keys: raise ValueError( - f"Unexpected primary key(s) found: {', '.join(unexpected_primary_keys)}" + f"Unexpected primary key(s) found in configuration file '{filename}': " + f"{', '.join(unexpected_primary_keys)}" ) # Nested validation else: @@ -812,7 +825,10 @@ def check_for_unexpected_keys(config_keys, required_keys): unexpected_keys.extend(check_for_unexpected_keys(config_keys, required_keys)) # ---- Raise error if unexpected_keys: - raise ValueError(f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path}") + raise ValueError( + f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path} in " + f"configuration file '{filename}'." + ) # Recursively validate nested dictionaries and lists for key, sub_model in keys.items(): @@ -841,31 +857,35 @@ def validate_list(config_value, allowed_types, key, path): if all(isinstance(item, (str, int, float)) for item in allowed_types): if config_value not in allowed_types: raise ValueError( - f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + f"Invalid value for key '{key}' at {path} in {filename}. Expected one of: " + f"{allowed_types}" ) elif not isinstance(config_value, list): if type(config_value) not in allowed_types: raise ValueError( - f"Invalid value for key '{key}' at {path}. Expected one of: {allowed_types}" + f"Invalid value for key '{key}' at {path} in {filename}. Expected one of: " + f"{allowed_types}" ) else: if isinstance(config_value, list): for item in config_value: if not any(isinstance(item, t) for t in allowed_types): raise ValueError( - f"Invalid type for items in list '{key}' at {path}. Expected one of: " - f"{allowed_types}" + f"Invalid type for items in list '{key}' at {path} in {filename}. " + f"Expected one of: {allowed_types}" ) else: raise ValueError( - f"Invalid type for key '{key}' at {path}. Expected a list of: {allowed_types}" + f"Invalid type for key '{key}' at {path} in {filename}. Expected a list of: " + f"{allowed_types}" ) # ---- def validate_type(config_value, expected_type, key, path): """Validate configuration with model that is at the furthest point along a branch""" if not isinstance(config_value, expected_type): raise ValueError( - f"Invalid type for key '{key}' at {path}. Expected type: {expected_type}" + f"Invalid type for key '{key}' at {path} in {filename}. 
Expected type: " + f"{expected_type}" ) # Validate all branches within the configuration dictionary From 0aa88ac61619087b0703bce480b0bf5417c54a22 Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Mon, 26 Aug 2024 10:55:14 -0700 Subject: [PATCH 76/81] Pre-commit formatting changes --- config_files/live_initialization_config.yml | 22 +- config_files/live_survey_year_2019_config.yml | 8 +- echopop/live/__init__.py | 2 +- echopop/live/live_acoustics.py | 303 +- echopop/live/live_biology.py | 815 +-- echopop/live/live_core.py | 101 +- echopop/live/live_data_loading.py | 395 +- echopop/live/live_data_processing.py | 231 +- echopop/live/live_spatial_methods.py | 316 +- echopop/live/live_survey.py | 315 +- echopop/live/live_visualizer.py | 438 +- echopop/live/sql_methods.py | 487 +- echopop/mesh_generation.py | 4520 +++++++++-------- echopop/test_workflow.py | 766 +-- echopop/utils/operations.py | 12 +- echopop/zarr_read_ingest_test.py | 3789 +++++++------- 16 files changed, 6532 insertions(+), 5988 deletions(-) diff --git a/config_files/live_initialization_config.yml b/config_files/live_initialization_config.yml index ae265343..4e386b3e 100644 --- a/config_files/live_initialization_config.yml +++ b/config_files/live_initialization_config.yml @@ -8,16 +8,16 @@ ######################## biology: # Length-binning - # NOTE: start : end : number + # NOTE: start : end : number length_distribution: bins: [2, 80, 40] # Station separation # NOTE: if `separate_stations` is True, `['list']` is required for `station_id` - stations: + stations: separate_stations: True station_id: ["length", "specimen"] - # Trawl identifier - catch: + # Trawl identifier + catch: partition: codend ##################################################################################################################### @@ -25,10 +25,10 @@ ######################## geospatial: inpfc: # INPFC northern latitude limits and labels - latitude_max: [36.0, 40.5, 43.0, + latitude_max: [36.0, 40.5, 43.0, 45.7667, 48.50, 55.0] - stratum_names: [1, 2, 3, 4, 5, 6] - griddify: + stratum_names: [1, 2, 3, 4, 5, 6] + griddify: # Coordinate bounds bounds: latitude: [32.75, 55.50] @@ -39,20 +39,20 @@ y_distance: 25.0 projection: epsg:4326 # EPSG integer code for geodetic parameter dataset # TODO: Remember to convert this back to a string - # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This + # NOTE: `link_biology_acoustics` defines how biological and acoustic data are linked with one another. This # comprises True/False statements that denote the desired association. All values set to "True" will be output. 
# `global` --> NASC associated with sigma_bs calculated from all survey data - # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs + # `INPFC` --> NASC for each INPFC stratum associated with matched stratum-specific sigma_bs # `closest_haul` --> NASC associated with sigma_bs calculated from the closest (spatially) trawls # `weighted_haul` --> NASC associated with sigma_bs calculated from all survey data weighted by distance from haul coordinates link_biology_acoustics: INPFC ##################################################################################################################### # Acoustics settings# - ######################## + ######################## acoustics: # Acoustic transmit frequency (Hz or kHz) - transmit: + transmit: frequency: 38.0 units: kHz # Target strength (TS) - length (L) regression: TS=m*log10(L)+b diff --git a/config_files/live_survey_year_2019_config.yml b/config_files/live_survey_year_2019_config.yml index fe8bb8b7..485ad86e 100644 --- a/config_files/live_survey_year_2019_config.yml +++ b/config_files/live_survey_year_2019_config.yml @@ -20,7 +20,7 @@ database_directory: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_fil ############################################################################## # Input data directories -input_directories: +input_directories: acoustics: directory: acoustics/ database_name: acoustics.db @@ -29,7 +29,7 @@ input_directories: directory: biology/ # directory: s3://sh2407-upload/data/Echopop-biology/ database_name: biology.db - extension: csv + extension: csv file_name_formats: catch: "{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}" length: "{DATE:YYYYMM}_{SPECIES_CODE}_{HAUL}_{FILE_ID:lf}" @@ -40,12 +40,12 @@ input_directories: length: [haul_num, species_id] specimen: [haul_num, species_id] trawl_info: [] - file_ids: + file_ids: catch: catch_perc length: lf specimen: spec trawl_info: operation_info - coastline: + coastline: directory: coastline/ coastline_name: ne_10m_land grid: diff --git a/echopop/live/__init__.py b/echopop/live/__init__.py index 325afcbb..1dca9f3e 100644 --- a/echopop/live/__init__.py +++ b/echopop/live/__init__.py @@ -2,4 +2,4 @@ __all__ = ["operations"] -# from _echopop_version import version as __version__ # noqa \ No newline at end of file +# from _echopop_version import version as __version__ # noqa diff --git a/echopop/live/live_acoustics.py b/echopop/live/live_acoustics.py index 2a9347a0..d2f7e763 100644 --- a/echopop/live/live_acoustics.py +++ b/echopop/live/live_acoustics.py @@ -1,19 +1,21 @@ -from typing import Union, Optional, List +from typing import Optional, Union + import numpy as np import pandas as pd -from ..acoustics import ts_length_regression, to_linear, to_dB -from .live_spatial_methods import apply_spatial_definitions, apply_griddify_definitions -from .sql_methods import sql_data_exchange, SQL, query_processed_files +from ..acoustics import to_linear, ts_length_regression +from .live_spatial_methods import apply_griddify_definitions, apply_spatial_definitions +from .sql_methods import query_processed_files, sql_data_exchange + # TODO: Documentation -def configure_transmit_frequency(frequency_values: pd.Series, - transmit_settings: dict, - current_units: str): - +def configure_transmit_frequency( + frequency_values: pd.Series, transmit_settings: dict, current_units: str +): + # Extract transmit frequency units defined in configuration file configuration_units = transmit_settings["units"] - + # Transform the units, if necessary # ---- Hz 
to kHz if current_units == "Hz" and configuration_units == "kHz": @@ -24,11 +26,12 @@ def configure_transmit_frequency(frequency_values: pd.Series, # ---- No change else: return frequency_values - + + # TODO: Documentation -def preprocess_acoustic_data(survey_data: pd.DataFrame, - spatial_dict: dict, - file_configuration: dict) -> pd.DataFrame: +def preprocess_acoustic_data( + survey_data: pd.DataFrame, spatial_dict: dict, file_configuration: dict +) -> pd.DataFrame: # Get acoustic processing settings acoustic_analysis_settings = file_configuration["acoustics"] @@ -37,10 +40,10 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, # Filter the dataset # ---- Configure `frequency_nominal`, if necessary - survey_data.loc[:, "frequency_nominal"] = ( - configure_transmit_frequency(survey_data.loc[:, "frequency_nominal"], - transmit_settings, - acoustic_analysis_settings["dataset_units"]["frequency"]) + survey_data.loc[:, "frequency_nominal"] = configure_transmit_frequency( + survey_data.loc[:, "frequency_nominal"], + transmit_settings, + acoustic_analysis_settings["dataset_units"]["frequency"], ) # ---- Filter out any unused frequency coordinates prc_nasc_df_filtered = ( @@ -50,15 +53,17 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, ) # Get grid coordinates - prc_nasc_df_filtered = pd.concat([ - prc_nasc_df_filtered, - apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]) - ], axis = 1) - + prc_nasc_df_filtered = pd.concat( + [ + prc_nasc_df_filtered, + apply_griddify_definitions(prc_nasc_df_filtered, file_configuration["geospatial"]), + ], + axis=1, + ) + # Apply spatial settings - prc_nasc_df_filtered = ( - prc_nasc_df_filtered - .assign(stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict)) + prc_nasc_df_filtered = prc_nasc_df_filtered.assign( + stratum=apply_spatial_definitions(prc_nasc_df_filtered["latitude"], spatial_dict) ) # Remaining adjustments to the acoustic data prior to being passed to the `LiveSurvey` object @@ -66,31 +71,28 @@ def preprocess_acoustic_data(survey_data: pd.DataFrame, prc_nasc_df_filtered.loc[:, "ship_id"] = file_configuration["ship_id"] # ---- Replace NASC `NaN` values with `0.0` prc_nasc_df_filtered.loc[:, "NASC"] = prc_nasc_df_filtered.loc[:, "NASC"].fillna(0.0) - # ---- Drop the `frequency_nominal` column and return the output - return prc_nasc_df_filtered.drop(columns = ["frequency_nominal"]) + # ---- Drop the `frequency_nominal` column and return the output + return prc_nasc_df_filtered.drop(columns=["frequency_nominal"]) + # TODO: Documentation -def average_sigma_bs(length: Union[pd.DataFrame, float, int], - weights: Optional[Union[float, int, str]] = None): +def average_sigma_bs( + length: Union[pd.DataFrame, float, int], weights: Optional[Union[float, int, str]] = None +): # Function approach for dataframe input if isinstance(length, pd.DataFrame): - if "length" not in length.columns: - raise ValueError( - "Column [`length`] missing from dataframe input `length`." - ) + if "length" not in length.columns: + raise ValueError("Column [`length`] missing from dataframe input `length`.") elif "TS_L_slope" not in length.columns: - raise ValueError( - "Column [`TS_L_slope`] missing from dataframe input `length`." - ) + raise ValueError("Column [`TS_L_slope`] missing from dataframe input `length`.") elif "TS_L_slope" not in length.columns: - raise ValueError( - "Column [`TS_L_intercept`] missing from dataframe input `length`." 
- ) - else: + raise ValueError("Column [`TS_L_intercept`] missing from dataframe input `length`.") + else: # ---- Compute the TS (as an array) - target_strength = ts_length_regression(length["length"], length["TS_L_slope"], - length["TS_L_intercept"]) + target_strength = ts_length_regression( + length["length"], length["TS_L_slope"], length["TS_L_intercept"] + ) # ---- Convert to `sigma_bs` sigma_bs_value = to_linear(target_strength) # ---- Weighted or arithmetic avveraging @@ -100,10 +102,11 @@ def average_sigma_bs(length: Union[pd.DataFrame, float, int], raise ValueError( f"Defined `weights` column, {weights}, missing from dataframe input " f"`length`." - ) + ) else: return (sigma_bs_value * length[weights]).sum() / length[weights].sum() + # TODO: Documentation # TODO: Refactor def estimate_echometrics(acoustic_data_df: pd.DataFrame): @@ -113,81 +116,92 @@ def estimate_echometrics(acoustic_data_df: pd.DataFrame): # Compute ABC # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) + acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852**2) # Pre-compute the change in depth acoustic_df["dz"] = acoustic_df["depth"].diff() # ---- Change first cell ! - acoustic_df.loc[0, "dz"] = ( - acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] - ) + acoustic_df.loc[0, "dz"] = acoustic_df.loc[1, "depth"] - acoustic_df.loc[0, "depth"] # Initialize echometrics dictionary echometrics = {} # Compute the metrics center-of-mass if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation_index": np.nan, - "occupied_area": 0.0, - }) + echometrics.update( + { + "n_layers": 0, + "mean_Sv": -999, + "max_Sv": -999, + "nasc_db": np.nan, + "center_of_mass": np.nan, + "dispersion": np.nan, + "evenness": np.nan, + "aggregation_index": np.nan, + "occupied_area": 0.0, + } + ) else: - - # Create the `echometrics` dictionary - echometrics.update({ - # ---- Number of layers - "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), - # ---- Mean Sv (back-calculated) - "mean_Sv": float( - 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - ), - # ---- Max Sv (back-calculated) - "max_Sv": float( - 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - ), - # ---- (Logarithmic) acoustic abundance - "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), - # ---- Center-of-mass - "center_of_mass": float( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ), - # ---- Evenness - "evenness": float( - (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - ), - # ---- Occupied area - "occupied_area": float( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) + + # Create the `echometrics` dictionary + echometrics.update( + { + # ---- Number of layers + "n_layers": int(acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size), + # ---- Mean Sv (back-calculated) + "mean_Sv": float( + 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) + ), + # ---- Max Sv (back-calculated) + "max_Sv": float( + 10 + * np.log10( + acoustic_df["ABC"].max() + / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"] + ) + ), + # ---- (Logarithmic) acoustic abundance + "nasc_db": float(10 * np.log10(acoustic_df["ABC"].sum())), + # ---- Center-of-mass + 
"center_of_mass": float( + (acoustic_df["depth"] * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() + ), + # ---- Evenness + "evenness": float( + (acoustic_df["NASC"] ** 2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 + ), + # ---- Occupied area + "occupied_area": float( + acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() + ), + } + ) # Update variable-dependent metrics - echometrics.update({ - # ---- Dispersion - "dispersion": float( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ), - # ---- Index of aggregation - "aggregation_index": float(1 / echometrics["evenness"]), - }) + echometrics.update( + { + # ---- Dispersion + "dispersion": float( + ( + (acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 + * acoustic_df["NASC"] + ).sum() + / (acoustic_df["NASC"]).sum() + ), + # ---- Index of aggregation + "aggregation_index": float(1 / echometrics["evenness"]), + } + ) # Return the dictionary return echometrics + def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # Vertically integrate PRC NASC nasc_dict = {"nasc": data_df["NASC"].sum()} - + # Horizontally concatenate `echometrics`, if `True` if echometrics: # ---- Compute values @@ -202,21 +216,23 @@ def integrate_nasc(data_df: pd.DataFrame, echometrics: bool = True): # return pd.DataFrame([nasc_dict]) -def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): + +def compute_nasc( + acoustic_data_df: pd.DataFrame, file_configuration: dict, echometrics: bool = True +): # Get spatial definitions, if any # spatial_column = file_configuration["spatial_column"] # Get stratum column, if any gridding_column = file_configuration["gridding_column"] - + # Integrate NASC (and compute the echometrics, if necessary) # ---- Get number of unique sources # if len(np.unique(acoustic_data_df["ping_time"])) > 1: # nasc_data_df = ( # acoustic_data_df - # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, + # .groupby(["longitude", "latitude", "ping_time", "source"] + spatial_column, # observed=False) # .apply(integrate_nasc, echometrics, include_groups=False).unstack() # .reset_index() @@ -224,49 +240,74 @@ def compute_nasc(acoustic_data_df: pd.DataFrame, file_configuration: dict, # ) # else: nasc_data_df = ( - acoustic_data_df - .groupby(["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, - observed=False) - .apply(integrate_nasc, echometrics, include_groups=False).droplevel(-1) + acoustic_data_df.groupby( + ["ship_id", "longitude", "latitude", "ping_time", "source"] + gridding_column, + observed=False, + ) + .apply(integrate_nasc, echometrics, include_groups=False) + .droplevel(-1) .reset_index() .sort_values("ping_time") ) # ---- Amend the dtypes if echometrics were computed if echometrics: # ---- Set dtypes - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation_index": float, "occupied_area": float}) + nasc_data_df = nasc_data_df.astype( + { + "n_layers": int, + "mean_Sv": float, + "max_Sv": float, + "nasc_db": float, + "center_of_mass": float, + "dispersion": float, + "evenness": float, + "aggregation_index": float, + "occupied_area": float, + } ) # ---- Reorder columns nasc_data_df = nasc_data_df[ gridding_column - + ["ship_id", "longitude", "latitude", "ping_time", "source", "nasc", 
"n_layers", - "nasc_db", "mean_Sv", "max_Sv", "aggregation_index", "center_of_mass", "dispersion", - "evenness", "occupied_area"] + + [ + "ship_id", + "longitude", + "latitude", + "ping_time", + "source", + "nasc", + "n_layers", + "nasc_db", + "mean_Sv", + "max_Sv", + "aggregation_index", + "center_of_mass", + "dispersion", + "evenness", + "occupied_area", + ] ] # Return the output return nasc_data_df + def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict, meta_dict: dict): # Get acoustic database filename acoustic_db = file_configuration["database"]["acoustics"] - - # Create a copy of the dataframe + + # Create a copy of the dataframe df = nasc_data_df.copy() - + # Add population-specific columns (specified in the file configuration) # TODO: Add to `yaml` file for configuration; hard-code for now add_columns = ["number_density", "biomass_density"] # ---- df[add_columns] = 0.0 # ---- Assign values for key values - key_values = [f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" - for index in df.index] + key_values = [ + f"{df.loc[index, 'ship_id']}-{str(index)}-{df.loc[index, 'source']}" for index in df.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values @@ -274,15 +315,23 @@ def format_acoustic_dataset(nasc_data_df: pd.DataFrame, file_configuration: dict root_database = file_configuration["database_directory"] # Update the successfully processed files - query_processed_files(root_database, - file_configuration["input_directories"]["acoustics"], - meta_dict["provenance"]["acoustic_files_read"], - processed=True) - + query_processed_files( + root_database, + file_configuration["input_directories"]["acoustics"], + meta_dict["provenance"]["acoustic_files_read"], + processed=True, + ) + # Insert the new data into the database & pull in the combined dataset # TODO: Replace with single-direction INSERT statement instead of INSERT/SELECT - _ = sql_data_exchange(acoustic_db, dataframe=df, table_name="survey_data_df", - id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - + _ = sql_data_exchange( + acoustic_db, + dataframe=df, + table_name="survey_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame, + ) + # Return the formatted dataframe - return df \ No newline at end of file + return df diff --git a/echopop/live/live_biology.py b/echopop/live/live_biology.py index 5e70d92b..a9cacb4b 100644 --- a/echopop/live/live_biology.py +++ b/echopop/live/live_biology.py @@ -1,11 +1,19 @@ -import pandas as pd +from functools import reduce + import numpy as np -from .sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary -from .live_spatial_methods import apply_spatial_definitions -from .live_acoustics import average_sigma_bs -from ..acoustics import ts_length_regression, to_dB, to_linear +import pandas as pd + from ..utils.operations import group_interpolator_creator -from functools import reduce +from .live_acoustics import average_sigma_bs +from .live_spatial_methods import apply_spatial_definitions +from .sql_methods import ( + SQL, + get_table_key_names, + sql_data_exchange, + sql_group_update, + sql_update_strata_summary, +) + def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): @@ -20,6 +28,7 @@ def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): # Return output return data_copy + def merge_trawl_info(biology_dict: dict): # 
Get the trawl information dictionary @@ -37,15 +46,15 @@ def merge_trawl_info(biology_dict: dict): # Drop the trawl information del biology_dict["trawl_info_df"] + def prepare_length_distribution(file_configuration: dict): # Get the length distribution parameters distrib_params = file_configuration["biology"]["length_distribution"]["bins"] # Create histogram bins - length_bins = ( - np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, - dtype=float) + length_bins = np.linspace( + **{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float ) # Get the binwidths @@ -56,8 +65,8 @@ def prepare_length_distribution(file_configuration: dict): # Format as a DataFrame and return the output # ---- Add Categorical interval column - length_bins_df = ( - pd.DataFrame({"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)}) + length_bins_df = pd.DataFrame( + {"length_bin": length_bins, "interval": pd.cut(length_bins, intervals)} ) # ---- Add numeric lower boundary length_bins_df["lower"] = length_bins_df["interval"].apply(lambda x: x.left).astype(float) @@ -67,11 +76,12 @@ def prepare_length_distribution(file_configuration: dict): # Return the dataframe that will be incorporated into the biological data attribute return length_bins_df + def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_configuration: dict): - + # Get SQL database file biology_db = file_configuration["database"]["biology"] - + # Get contrasts used for filtering the dataset # ---- Species species_filter = file_configuration["species"]["number_code"] @@ -82,8 +92,9 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # Apply the filter filtered_biology_output = { - key: biology_data_filter(df, filter_dict) - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty + key: biology_data_filter(df, filter_dict) + for key, df in biology_output.items() + if isinstance(df, pd.DataFrame) and not df.empty } # ---- Create new data flag file_configuration["length_distribution"] = prepare_length_distribution(file_configuration) @@ -99,7 +110,7 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi table_list = list(set(SQL(biology_db, "map")) - set(["files_read"])) # ---- Plug into the dictionary filtered_biology_output.update({key: pd.DataFrame() for key in table_list}) - # ---- Initialize the results dictionary + # ---- Initialize the results dictionary sql_results_dict = {key: pd.DataFrame() for key in filtered_biology_output.keys()} # Update the SQL database @@ -109,17 +120,21 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # ---- Create copy df = df.copy() # ---- Assign values for key values - key_values = [str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) - for index in df.index] + key_values = [ + str(index) + "-" + "-".join(df.loc[index, key_columns].values.astype(str)) + for index in df.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint df.loc[:, "id"] = key_values # ---- Insert the new data into the database & pull in the combined dataset - table_df = sql_data_exchange(biology_db, - dataframe=df, - table_name=table_name, - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) + table_df = sql_data_exchange( + biology_db, + dataframe=df, + table_name=table_name, + id_columns=["id"], + primary_keys=["id"], + 
output_type=pd.DataFrame, + ) # ---- Drop SQL db identifier if "id" in table_df.columns: table_df.drop(columns="id", inplace=True) @@ -129,18 +144,19 @@ def preprocess_biology_data(biology_output: dict, spatial_dict: dict, file_confi # Return the output return filtered_biology_output, sql_results_dict -def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, - file_configuration: dict): + +def compute_sigma_bs( + specimen_data: pd.DataFrame, length_data: pd.DataFrame, file_configuration: dict +): # Determine contrast columns # ----- Check for "stratum" column in spatial definitions configuration stratum_column = file_configuration["spatial_column"] # ---- Append to other defined keys contrast_columns = stratum_column + ["haul_num", "species_id", "length"] - + # Meld the biological datasets - length_datasets = specimen_data.meld(length_data, - contrasts=contrast_columns) + length_datasets = specimen_data.meld(length_data, contrasts=contrast_columns) # Get the TS-length model parameterization ts_length_parameters_spp = [ @@ -152,21 +168,22 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # Extract the target species information target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) # ---- Filter out non-target species - length_datasets = ( - length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] - ) + length_datasets = length_datasets[ + length_datasets["species_id"].isin(target_species["number_code"]) + ] # ---- Merge with `length_datasets` - ts_length_df = length_datasets.merge(target_species, - left_on=["species_id"], right_on=["number_code"]) + ts_length_df = length_datasets.merge( + target_species, left_on=["species_id"], right_on=["number_code"] + ) # Compute the mean sigma_bs for this particular haul # ---- Create primary key list key_list = list(set(contrast_columns) - set(["length"])) # ---- Compute haul-specific means sigma_bs_df = ( - ts_length_df - .groupby(key_list, observed=False) - [["TS_L_slope", "TS_L_intercept", "length", "length_count"]] + ts_length_df.groupby(key_list, observed=False)[ + ["TS_L_slope", "TS_L_intercept", "length", "length_count"] + ] .apply(lambda x: average_sigma_bs(x, weights="length_count")) .to_frame("sigma_bs") ) @@ -174,9 +191,7 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # For SQL database storage purposes, the sum and count are stored instead # ---- Count sum sigma_bs_df["sigma_bs_count"] = ( - ts_length_df.reset_index() - .groupby(key_list, observed=False)["length_count"] - .sum() + ts_length_df.reset_index().groupby(key_list, observed=False)["length_count"].sum() ) # ---- Value sum sigma_bs_df["sigma_bs_sum"] = sigma_bs_df["sigma_bs"] * sigma_bs_df["sigma_bs_count"] @@ -194,11 +209,21 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create an insertion dataframe insertion_df = sigma_bs_df.copy() # ---- Create - SQL(biology_db, "create", table_name="sigma_bs_mean_df", dataframe=insertion_df, - primary_keys=key_list+["id"]) + SQL( + biology_db, + "create", + table_name="sigma_bs_mean_df", + dataframe=insertion_df, + primary_keys=key_list + ["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df, - id_columns=key_list+["id"]) + SQL( + biology_db, + "insert", + table_name="sigma_bs_mean_df", + dataframe=insertion_df, + id_columns=key_list + ["id"], + ) else: # ---- Get previous values in the table table_df = SQL(biology_db, 
"select", table_name="sigma_bs_mean_df") @@ -215,22 +240,27 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # ---- Create DataFrame insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] # ---- INSERT - SQL(biology_db, "insert", table_name="sigma_bs_mean_df", - dataframe=insertion_df) + SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=insertion_df) # ---- UPDATE values if update_keys: update_df = sigma_bs_df[sigma_bs_df["id"].isin(update_keys)] # ---- Create a filter condition command - sql_group_update(biology_db, dataframe=update_df, table_name="sigma_bs_mean_df", - columns=["sigma_bs_count", "sigma_bs_sum"], operation="+", - unique_columns=["id"], id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=update_df, + table_name="sigma_bs_mean_df", + columns=["sigma_bs_count", "sigma_bs_sum"], + operation="+", + unique_columns=["id"], + id_columns=["id"], + ) # condition_str = " & ".join([f"id = {id_value}" for id_value in update_keys]) - # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, - # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=update_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], # condition=condition_str) # # ---- Check the present keys - # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", + # current_keys_dict = SQL(acoustic_db, "inspect", table_name="sigma_bs_mean_df", # columns=key_list) # # ---- Insert if missing # if not all([all(sigma_bs_df[key].isin(current_keys_dict[key])) for key in key_list]): @@ -238,22 +268,30 @@ def compute_sigma_bs(specimen_data: pd.DataFrame, length_data: pd.DataFrame, # # ---- Update if not missing # else: # # ---- Create a filter condition command - # condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key in key_list]) - # # ---- Update the table key - # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, - # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) + # condition_str = " & ".join([f"{key} in {np.unique(sigma_bs_df[key])}" for key inkey_list]) + # # ---- Update the table key + # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", dataframe=sigma_bs_df, + # operation="+", columns=["sigma_bs_count", "sigma_bs_sum"], condition=condition_str) # # ---- Update the actual `sigma_bs` value in the table # SQL(acoustic_db, "update", table_name="sigma_bs_mean_df", columns=["sigma_bs"], # operation="sigma_bs_sum / sigma_bs_count", condition=condition_str) - -def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, - file_configuration: dict): - + + +def length_weight_regression( + specimen_data: pd.DataFrame, distribution_df: pd.DataFrame, file_configuration: dict +): + # Get the spatial column name, if there is one spatial_column = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used - contrast_columns = spatial_column + ["trawl_partition", "sex", "haul_num", "species_id", "length_bin"] - + contrast_columns = spatial_column + [ + "trawl_partition", + "sex", + "haul_num", + "species_id", + "length_bin", + ] + # Gather specimen measurements to represent 'all' fish specimen_data_all = specimen_data.assign(sex="all") @@ -261,7 +299,7 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # ---- Vertical concatenation 
specimen_data_all = pd.concat( [specimen_data[specimen_data["sex"].isin(["male", "female"])], specimen_data_all], - ignore_index=True + ignore_index=True, ) # ---- Remove bad values specimen_data_all.dropna(subset=["length", "weight"], inplace=True) @@ -273,21 +311,26 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # ---- Query database # if not SQL(biology_db, "validate", table_name="specimen_data_df"): # ---- Assign values for key values - key_values = [str(index) + "-" - + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) - for index in specimen_data_all.index] + key_values = [ + str(index) + + "-" + + "-".join(specimen_data_all.loc[index, contrast_columns].values.astype(str)) + for index in specimen_data_all.index + ] # ---- Add an autoincrementing tag that will serve as a primary key and unique constraint specimen_data_all.loc[:, "id"] = key_values # ---- Insert the new data into the database & pull in the combined dataset - specimen_data_sql = sql_data_exchange(biology_db, - dataframe=specimen_data_all, - table_name="specimen_data_df", - id_columns=["id"], - primary_keys=["id"], - output_type=pd.DataFrame) + specimen_data_sql = sql_data_exchange( + biology_db, + dataframe=specimen_data_all, + table_name="specimen_data_df", + id_columns=["id"], + primary_keys=["id"], + output_type=pd.DataFrame, + ) # ---- Drop SQL db identifier specimen_data_sql.drop(columns="id", inplace=True) - + # Fit length-weight linear regression by male, female, and all fish length_weight_regression_df = ( specimen_data_sql.groupby(["species_id", "sex"]) @@ -329,14 +372,12 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da ) # ---- Merge with the fitted weights weight_fitted_distribution_df = weight_fitted_distribution_df.merge( - weight_fitted_df, - on=["species_id", "sex", "length_bin"], - how="outer" + weight_fitted_df, on=["species_id", "sex", "length_bin"], how="outer" ) # ---- Fill missing counts - weight_fitted_distribution_df["weight_mean"] = ( - weight_fitted_distribution_df["weight_mean"].fillna(0.0) - ) + weight_fitted_distribution_df["weight_mean"] = weight_fitted_distribution_df[ + "weight_mean" + ].fillna(0.0) # ---- Fill missing weights weight_fitted_distribution_df["count"] = ( weight_fitted_distribution_df["count"].fillna(0).astype(int) @@ -354,39 +395,60 @@ def length_weight_regression(specimen_data: pd.DataFrame, distribution_df: pd.Da # Check for `weight_fitted_df` in the database file # ---- Create id/primary key - key_values = ["-".join(weight_fitted_distribution_df - .loc[idx, ["species_id", "sex", "length_bin"]] - .values.astype(str)) - for idx in weight_fitted_distribution_df.index] + key_values = [ + "-".join( + weight_fitted_distribution_df.loc[ + idx, ["species_id", "sex", "length_bin"] + ].values.astype(str) + ) + for idx in weight_fitted_distribution_df.index + ] # ---- Add to the output output_df = weight_fitted_distribution_df.assign(id=key_values) # ---- Query database if not SQL(biology_db, "validate", table_name="weight_fitted_df"): # ---- Create - SQL(biology_db, "create", table_name="weight_fitted_df", - dataframe=output_df, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="weight_fitted_df", + dataframe=output_df, + primary_keys=["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="weight_fitted_df", - dataframe=output_df, id_columns=["id"]) + SQL( + biology_db, + "insert", + table_name="weight_fitted_df", + dataframe=output_df, + 
id_columns=["id"], + ) else: # ---- Update the table - sql_group_update(db_file=biology_db, - dataframe=output_df, - table_name="weight_fitted_df", - columns=["weight_fitted"], - unique_columns=["species_id", "sex", "length_bin"], - id_columns=["id"]) - + sql_group_update( + db_file=biology_db, + dataframe=output_df, + table_name="weight_fitted_df", + columns=["weight_fitted"], + unique_columns=["species_id", "sex", "length_bin"], + id_columns=["id"], + ) + # Return the dataframe return weight_fitted_distribution_df -def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, - length_weight_df: pd.DataFrame, file_configuration: dict): - + +def length_bin_weights( + length_data: pd.DataFrame, + specimen_data: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() # ---- Get the spatial key - spatial_key = contrast_columns.copy() + # spatial_key = contrast_columns.copy() # ---- Append additional columns that will be used contrast_columns.extend(["sex", "species_id"]) @@ -394,7 +456,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, biology_db = file_configuration["database"]["biology"] # Pull the relevant data - # SQL(biology_db, "select", table_name="length_df", + # SQL(biology_db, "select", table_name="length_df", # columns=list(set(length_data.columns) - set(["length_bin"]))) # list(set(length_data.columns) - set(["length_bin"])) # Get length distribution @@ -410,6 +472,7 @@ def length_bin_weights(length_data: pd.DataFrame, specimen_data: pd.DataFrame, dependent_var="weight_fitted", contrast=["sex", "species_id"], ) + # ---- Create helper/lambda function def weight_interpolator(dataframe_row): sex = dataframe_row["sex"] @@ -418,8 +481,8 @@ def weight_interpolator(dataframe_row): if (sex, species_id) in interpolators: return interpolators[(sex, species_id)](length) else: - return None - + return None + # Extract only sexed fish from the unaged (station 1) length dataset length_data_sexed = length_data[length_data["sex"].isin(["male", "female"])].copy() # ---- Add interpolated weights to the general length dataset @@ -428,8 +491,9 @@ def weight_interpolator(dataframe_row): ) # ---- Convert interpolated weights (summed across length counts) into a table length_table_sexed = ( - length_data_sexed - .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight_interp"].sum() + length_data_sexed.groupby(list(set(contrast_columns).union(set(["length_bin"]))))[ + "weight_interp" + ].sum() ).reset_index() # Remove specimen data with missing data required for this analysis @@ -439,15 +503,16 @@ def weight_interpolator(dataframe_row): specimen_data_filtered = specimen_data_filtered.dropna(subset=["length", "weight"]) # ---- Convert to a table specimen_table_sexed = ( - specimen_data_filtered - .groupby(list(set(contrast_columns).union(set(["length_bin"]))))["weight"].sum() + specimen_data_filtered.groupby(list(set(contrast_columns).union(set(["length_bin"]))))[ + "weight" + ].sum() ).reset_index() # Check for `length_weight_df` in the database file # ---- Combine the datasets - full_weight_distrib = ( - pd.concat([length_table_sexed.rename(columns={"weight_interp": "weight"}), - specimen_table_sexed], ignore_index=True) + full_weight_distrib = pd.concat( + [length_table_sexed.rename(columns={"weight_interp": "weight"}), specimen_table_sexed], + ignore_index=True, ) # ---- Sum by bin full_weight_distrib 
= ( @@ -455,46 +520,69 @@ def weight_interpolator(dataframe_row): ) # ---- Create id/primary key full_weight_distrib.loc[:, "id"] = ( - full_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1).astype(str) + full_weight_distrib[contrast_columns + ["length_bin"]] + .apply(tuple, axis=1) + .astype(str) .str.replace("'", "") ) # - key_values = ["-".join(length_table_sexed.reset_index() - .loc[idx, ["species_id", "sex", "length_bin"]] - .values.astype(str)) - for idx in length_table_sexed.reset_index().index] + key_values = [ + "-".join( + length_table_sexed.reset_index() + .loc[idx, ["species_id", "sex", "length_bin"]] + .values.astype(str) + ) + for idx in length_table_sexed.reset_index().index + ] # ---- Add to the output length_table_sexed["id"] = key_values # ---- Query database if not SQL(biology_db, "validate", table_name="length_weight_df"): # ---- Create full table overall_weight_distrib = ( - pd.DataFrame({"stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + - [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1]}) + pd.DataFrame( + { + "stratum": file_configuration["geospatial"]["inpfc"]["stratum_names"] + + [len(file_configuration["geospatial"]["inpfc"]["stratum_names"]) + 1] + } + ) .merge(pd.DataFrame({"sex": ["male", "female"]}), how="cross") - .merge(pd.DataFrame( - {"species_id": np.unique(file_configuration["species"]["number_code"])} - ), how="cross") + .merge( + pd.DataFrame( + {"species_id": np.unique(file_configuration["species"]["number_code"])} + ), + how="cross", + ) .merge(distribution_df.filter(["length_bin"]), how="cross") ) # ---- Pre-allocate weight overall_weight_distrib.loc[:, "weight"] = 0.0 # ---- Create id/primary key overall_weight_distrib.loc[:, "id"] = ( - overall_weight_distrib[contrast_columns + ["length_bin"]].apply(tuple, axis=1) + overall_weight_distrib[contrast_columns + ["length_bin"]] + .apply(tuple, axis=1) .astype(str) .str.replace("'", "") ) # ---- Create - SQL(biology_db, "create", table_name="length_weight_df", - dataframe=overall_weight_distrib, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="length_weight_df", + dataframe=overall_weight_distrib, + primary_keys=["id"], + ) # ---- INSERT - SQL(biology_db, "insert", table_name="length_weight_df", - dataframe=overall_weight_distrib) + SQL(biology_db, "insert", table_name="length_weight_df", dataframe=overall_weight_distrib) # ---- UPDATE - sql_group_update(biology_db, dataframe=full_weight_distrib, table_name="length_weight_df", - columns=["weight"], - unique_columns=["id"], id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=full_weight_distrib, + table_name="length_weight_df", + columns=["weight"], + unique_columns=["id"], + id_columns=["id"], + ) # table_df = SQL(biology_db, "select", table_name="length_weight_df") # # ---- Check the table keys # table_keys = np.unique(table_df["id"]).tolist() @@ -509,61 +597,66 @@ def weight_interpolator(dataframe_row): # # ---- Create DataFrame # insertion_df = full_weight_distrib[full_weight_distrib["id"].isin(insertion_keys)] # # ---- INSERT - # SQL(biology_db, "insert", table_name="length_weight_df", + # SQL(biology_db, "insert", table_name="length_weight_df", # dataframe=insertion_df) # # ---- UPDATE values # if update_keys: # update_df = full_weight_distrib[full_weight_distrib["id"].isin(update_keys)] # # ---- Create a filter condition command - # sql_group_update(biology_db, dataframe=update_df, table_name="length_weight_df", + # sql_group_update(biology_db, 
dataframe=update_df, table_name="length_weight_df", # columns=["weight"], - # unique_columns=["id"], id_columns=["id"]) + # unique_columns=["id"], id_columns=["id"]) # # ---- Update the table - # sql_group_update(db_file=biology_db, - # dataframe=length_table_sexed, - # table_name="length_weight_df", + # sql_group_update(db_file=biology_db, + # dataframe=length_table_sexed, + # table_name="length_weight_df", # columns=["weight_interp"], - # unique_columns=contrast_columns, + # unique_columns=contrast_columns, # id_columns=["id"]) # length_sql_sexed - - + # , specimen_sql_sexed # Return outputs return length_table_sexed, specimen_table_sexed -def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: pd.DataFrame, - length_binned: pd.DataFrame, file_configuration: dict): - + +def number_proportions( + specimen_binned: pd.DataFrame, + specimen_binned_filtered: pd.DataFrame, + length_binned: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() # ---- Append additional columns that will be used contrast_columns.extend(["sex", "species_id"]) - # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) - for col, df in zip(contrast_columns, [specimen_binned, - specimen_binned_filtered, - length_binned])] + dfs = [ + pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip( + contrast_columns, [specimen_binned, specimen_binned_filtered, length_binned] + ) + ] # ---- Reduce into a single DataFrame - count_total = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + count_total = reduce(lambda left, right: pd.merge(left, right, how="cross"), dfs) # ---- Set the indices count_total.set_index(contrast_columns, inplace=True) # ---- Specimen count count_total["total_specimen"] = specimen_binned.groupby(contrast_columns)["count"].sum() # ---- Specimen filtered count - count_total["total_specimen_filtered"] = ( - specimen_binned_filtered.groupby(contrast_columns)["count"].sum() - ) + count_total["total_specimen_filtered"] = specimen_binned_filtered.groupby(contrast_columns)[ + "count" + ].sum() # ---- Length count count_total["total_length"] = length_binned.groupby(contrast_columns)["count"].sum() # ---- Fill NaN count_total.fillna(0, inplace=True) - count_total = ( - count_total.reset_index().set_index(list(set(contrast_columns) - set(["sex", "species_id"]))) + count_total = count_total.reset_index().set_index( + list(set(contrast_columns) - set(["sex", "species_id"])) ) # ---- Grand totals count_total["total_overall"] = ( @@ -577,8 +670,10 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: specimen_number_proportion = specimen_binned_filtered[ specimen_binned_filtered["sex"].isin(["male", "female", "all"]) ].merge( - count_total[list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"])))], - on=contrast_columns + count_total[ + list(set(contrast_columns).union(set(["total_specimen_filtered", "total_overall"]))) + ], + on=contrast_columns, ) # ---- Within-dataset proportion specimen_number_proportion["proportion_number_specimen"] = ( @@ -602,7 +697,7 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: length_binned["sex"].isin(["male", "female", "all"]) ].merge( count_total[list(set(contrast_columns).union(set(["total_length", "total_overall"])))], - on=contrast_columns + on=contrast_columns, ) 
# ---- Within-dataset proportion length_number_proportion["proportion_number_length"] = ( @@ -616,9 +711,7 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: # Gather unaged (sexed) number proportions # ---- Merge sex_number_proportions = sex_number_proportions.merge( - length_number_proportion.groupby(contrast_columns)[ - "proportion_number_length_overall" - ] + length_number_proportion.groupby(contrast_columns)["proportion_number_length_overall"] .sum() .reset_index(), how="outer", @@ -627,13 +720,15 @@ def number_proportions(specimen_binned: pd.DataFrame, specimen_binned_filtered: sex_number_proportions["proportion_number_overall"] = ( sex_number_proportions.proportion_number_specimen_overall + sex_number_proportions.proportion_number_length_overall - ) + ) # Return the output return specimen_number_proportion, length_number_proportion, sex_number_proportions -def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, - file_configuration: dict): + +def length_bin_counts( + length_data: pd.DataFrame, specimen_data: pd.DataFrame, file_configuration: dict +): # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() @@ -673,12 +768,12 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, contrasts=contrast_columns, variable="length_count", fun="sum", - ) + ) return ( - specimen_number_distribution, - specimen_number_distribution_filtered, - length_number_distribution + specimen_number_distribution, + specimen_number_distribution_filtered, + length_number_distribution, ) @@ -698,12 +793,12 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # # Generate number counts for the length distribution # length_datasets = ( # biology_dict["specimen_df"] -# .meld(biology_dict["length_df"], +# .meld(biology_dict["length_df"], # contrasts=list(set(contrast_columns).union(["length_bin"]))) -# ) +# ) # # ---- Create 'all' # length_datasets_all = pd.concat([ -# length_datasets[length_datasets["sex"].isin(["male", "female"])], +# length_datasets[length_datasets["sex"].isin(["male", "female"])], # length_datasets.assign(sex="all") # ]) @@ -712,7 +807,7 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # length_datasets_all # .groupby(contrast_columns, observed=False)["length_count"].sum() # ) - + # # Get distinct DataFrame columns # distinct_keys = ( # grouped_length @@ -735,27 +830,27 @@ def length_bin_counts(length_data: pd.DataFrame, specimen_data: pd.DataFrame, # # ---- Create id/primary key # key_values = ["-".join(output_df # .loc[idx, ["species_id", "sex", "length_bin"]] -# .values.astype(str)) +# .values.astype(str)) # for idx in output_df.index] # # ---- Add to the output # output_df["id"] = key_values # # ---- Query database # if not SQL(biology_db, "validate", table_name="length_count_df"): # # ---- Create -# SQL(biology_db, "create", table_name="length_count_df", -# dataframe=output_df, primary_keys=["id"]) +# SQL(biology_db, "create", table_name="length_count_df", +# dataframe=output_df, primary_keys=["id"]) # # ---- Populate table -# SQL(biology_db, "insert", table_name="length_count_df", +# SQL(biology_db, "insert", table_name="length_count_df", # dataframe=output_df, id_columns=["id"]) # else: # # ---- Update the table -# sql_group_update(db_file=biology_db, -# dataframe=output_df, -# table_name="length_count_df", +# sql_group_update(db_file=biology_db, +# dataframe=output_df, +# 
table_name="length_count_df", # columns=["count"], -# unique_columns=contrast_columns, +# unique_columns=contrast_columns, # id_columns=["id"]) - + # # Return output # return output_df @@ -767,25 +862,26 @@ def _quantize_lengths(dataset, distribution): # ---- Cut/merge the underlying histogram/discretized length bins if "length" in dataset.columns: # ---- Cut the intervals - dataset["length_bin"] = pd.cut(dataset["length"], - np.unique(np.hstack([distribution["lower"], - distribution["upper"]])), - labels=distribution["length_bin"]).astype(float) + dataset["length_bin"] = pd.cut( + dataset["length"], + np.unique(np.hstack([distribution["lower"], distribution["upper"]])), + labels=distribution["length_bin"], + ).astype(float) # ---- Return the dataset return dataset - + # Update the data dictionary - biology_dict.update({ - k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items() - }) + biology_dict.update({k: _quantize_lengths(d, distribution_df) for k, d in biology_dict.items()}) -def compute_average_weights(specimen_number_proportion: pd.DataFrame, - length_number_proportion: pd.DataFrame, - sex_number_proportions: pd.DataFrame, - length_weight_df: pd.DataFrame, - distribution_df: pd.DataFrame, - file_configuration: dict): +def compute_average_weights( + specimen_number_proportion: pd.DataFrame, + length_number_proportion: pd.DataFrame, + sex_number_proportions: pd.DataFrame, + length_weight_df: pd.DataFrame, + distribution_df: pd.DataFrame, + file_configuration: dict, +): # Get the spatial column name, if there is one contrast_columns = file_configuration["spatial_column"].copy() @@ -795,25 +891,30 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, overall_proportions = sex_number_proportions[sex_number_proportions["sex"] == "all"] updated_proportions = sex_number_proportions.copy() - updated_proportions["number_proportion_length_all"] = overall_proportions["proportion_number_length_overall"].values[0] - updated_proportions["number_proportion_specimen_all"] = overall_proportions["proportion_number_specimen_overall"].values[0] + updated_proportions["number_proportion_length_all"] = overall_proportions[ + "proportion_number_length_overall" + ].values[0] + updated_proportions["number_proportion_specimen_all"] = overall_proportions[ + "proportion_number_specimen_overall" + ].values[0] # Calculate the mixed aged and unaged number proportions - updated_proportions["proportion_length"] = ( - updated_proportions["number_proportion_length_all"] / - (updated_proportions["number_proportion_length_all"] + - updated_proportions["proportion_number_specimen_overall"]) + updated_proportions["proportion_length"] = updated_proportions[ + "number_proportion_length_all" + ] / ( + updated_proportions["number_proportion_length_all"] + + updated_proportions["proportion_number_specimen_overall"] ) # ---- Calculate aged number proportions per sex per stratum - updated_proportions["proportion_specimen"] = ( - updated_proportions["proportion_number_specimen_overall"] / ( - updated_proportions["proportion_number_specimen_overall"] + - updated_proportions["proportion_length"] - ) + updated_proportions["proportion_specimen"] = updated_proportions[ + "proportion_number_specimen_overall" + ] / ( + updated_proportions["proportion_number_specimen_overall"] + + updated_proportions["proportion_length"] ) # ---- Reduce the columns - proportion_df = ( - updated_proportions.filter(contrast_columns + ["proportion_length", "proportion_specimen"]) + proportion_df = 
updated_proportions.filter( + contrast_columns + ["proportion_length", "proportion_specimen"] ) # Combine the aged-unaged (or station-specific) proportions for calculations @@ -828,8 +929,9 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ).reset_index() # ---- Convert to Table (to replicate indexed matrix operations) station_proportions_table = station_proportions.pivot_table( - index=["species_id", "group", "sex"], - columns=file_configuration["spatial_column"].copy(), values="proportion" + index=["species_id", "group", "sex"], + columns=file_configuration["spatial_column"].copy(), + values="proportion", ).fillna(0.0) # Calculate the number length proportions that will later be converted into weight @@ -842,19 +944,22 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, .reset_index(name="number_proportion") ) # ---- Length - length_length_distribution = ( - length_number_proportion[length_number_proportion.sex != "unsexed"][ - contrast_columns + ["length_bin", "proportion_number_length"] - ].rename(columns={"proportion_number_length": "number_proportion"}) + length_length_distribution = length_number_proportion[ + length_number_proportion.sex != "unsexed" + ][contrast_columns + ["length_bin", "proportion_number_length"]].rename( + columns={"proportion_number_length": "number_proportion"} ) # Get unique values of each contrast column across the biological datasets - dfs = [pd.DataFrame({col: df[col].unique().tolist()}) - for col, df in zip(contrast_columns, [specimen_number_proportion, - length_number_proportion, - sex_number_proportions])] + dfs = [ + pd.DataFrame({col: df[col].unique().tolist()}) + for col, df in zip( + contrast_columns, + [specimen_number_proportion, length_number_proportion, sex_number_proportions], + ) + ] # ---- Reduce into a single DataFrame - full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how='cross'), dfs) + full_contrast_keys = reduce(lambda left, right: pd.merge(left, right, how="cross"), dfs) # length_distribution_df = distribution_df.copy() @@ -865,17 +970,29 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, ) specimen_length_complete = complete_distrib_df.copy() - specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() - specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete["number_proportion"].fillna(0.0) + specimen_length_complete["number_proportion"] = specimen_length_distribution.set_index( + contrast_columns + ["length_bin"] + ).sort_index() + specimen_length_complete.loc[:, "number_proportion"] = specimen_length_complete[ + "number_proportion" + ].fillna(0.0) length_length_complete = complete_distrib_df.copy() - length_length_complete["number_proportion"] = length_length_distribution.set_index(contrast_columns + ["length_bin"]).sort_index() - length_length_complete.loc[:, "number_proportion"] = length_length_complete["number_proportion"].fillna(0.0) + length_length_complete["number_proportion"] = length_length_distribution.set_index( + contrast_columns + ["length_bin"] + ).sort_index() + length_length_complete.loc[:, "number_proportion"] = length_length_complete[ + "number_proportion" + ].fillna(0.0) # ---- Concatenate the two datasets combined_number_proportions = ( - pd.concat([specimen_length_complete.assign(group="specimen"), - length_length_complete.assign(group="length")]) + pd.concat( + [ + specimen_length_complete.assign(group="specimen"), + 
length_length_complete.assign(group="length"), + ] + ) ).reset_index() # ---- Convert to Table (to replicate indexed matrix operations) length_proportions_table = combined_number_proportions.pivot_table( @@ -894,47 +1011,55 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, # ---- All fitted_weight_table.loc[:, "all", :] weight_all = fitted_weight_table.loc[:, "all", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "all"] + length_proportions_table.loc[:, "specimen", "all"] * station_proportions_table.loc[:, "specimen", "all"] + length_proportions_table.loc[:, "length", "all"] * station_proportions_table.loc[:, "length", "all"] ) weight_male = fitted_weight_table.loc[:, "male", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "male"] + length_proportions_table.loc[:, "specimen", "male"] * station_proportions_table.loc[:, "specimen", "male"] + length_proportions_table.loc[:, "length", "male"] * station_proportions_table.loc[:, "length", "male"] ) weight_female = fitted_weight_table.loc[:, "female", :]["weight_fitted"].values.dot( - length_proportions_table.loc[:, "specimen", "female"] + length_proportions_table.loc[:, "specimen", "female"] * station_proportions_table.loc[:, "specimen", "female"] + length_proportions_table.loc[:, "length", "female"] * station_proportions_table.loc[:, "length", "female"] ) # ---- Combine the averaged weights for each sex and all fish fitted_weight_df = full_contrast_keys.copy() - fitted_weight_df["average_weight"] = ( - np.concatenate([weight_all, weight_male, weight_female]) - ) + fitted_weight_df["average_weight"] = np.concatenate([weight_all, weight_male, weight_female]) # Get database file biology_db = file_configuration["database"]["biology"] # Insert/update the table # ---- Create id/primary key - key_values = ["-".join(fitted_weight_df.reset_index() - .loc[idx, contrast_columns] - .values.astype(str)) - for idx in fitted_weight_df.reset_index().index] + key_values = [ + "-".join(fitted_weight_df.reset_index().loc[idx, contrast_columns].values.astype(str)) + for idx in fitted_weight_df.reset_index().index + ] # ---- Add to the output fitted_weight_df["id"] = key_values if not SQL(biology_db, "validate", table_name="weight_stratum_df"): # ---- Create - SQL(biology_db, "create", table_name="weight_stratum_df", - dataframe=fitted_weight_df, primary_keys=["id"]) + SQL( + biology_db, + "create", + table_name="weight_stratum_df", + dataframe=fitted_weight_df, + primary_keys=["id"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="weight_stratum_df", - dataframe=fitted_weight_df, id_columns=["id"]) + SQL( + biology_db, + "insert", + table_name="weight_stratum_df", + dataframe=fitted_weight_df, + id_columns=["id"], + ) else: # ---- Get previous values in the table table_df = SQL(biology_db, "select", table_name="weight_stratum_df") @@ -953,27 +1078,38 @@ def compute_average_weights(specimen_number_proportion: pd.DataFrame, # ---- Create DataFrame insertion_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(insertion_keys)] # ---- INSERT - SQL(biology_db, "insert", table_name="weight_stratum_df", - dataframe=insertion_df.drop(columns="current_keys")) + SQL( + biology_db, + "insert", + table_name="weight_stratum_df", + dataframe=insertion_df.drop(columns="current_keys"), + ) # ---- UPDATE values if update_keys: # ---- Create DataFrame update_df = fitted_weight_df[fitted_weight_df["current_keys"].isin(update_keys)] # ---- UPDATE - sql_group_update(biology_db, 
dataframe=update_df, - table_name="weight_stratum_df", columns=["average_weight"], - unique_columns=contrast_columns, - id_columns=["id"]) + sql_group_update( + biology_db, + dataframe=update_df, + table_name="weight_stratum_df", + columns=["average_weight"], + unique_columns=contrast_columns, + id_columns=["id"], + ) # Return output return fitted_weight_df -def weight_proportions(catch_data: pd.DataFrame, - specimen_weight_binned: pd.DataFrame, - length_weight_binned: pd.DataFrame, - length_number_proportion: pd.DataFrame, - length_weight_df: pd.DataFrame, - file_configuration: dict): - + +def weight_proportions( + catch_data: pd.DataFrame, + specimen_weight_binned: pd.DataFrame, + length_weight_binned: pd.DataFrame, + length_number_proportion: pd.DataFrame, + length_weight_df: pd.DataFrame, + file_configuration: dict, +): + # Get the spatial column name, if there is one spatial_column = file_configuration["spatial_column"] # ---- Append additional columns that will be used @@ -982,77 +1118,58 @@ def weight_proportions(catch_data: pd.DataFrame, # Calculate grouped totals # ---- Sum the net haul weights from station 1/unaged fish catch_weights = catch_data.count_variable( - contrasts=["species_id"] + spatial_column, - variable="haul_weight", fun="sum" + contrasts=["species_id"] + spatial_column, variable="haul_weight", fun="sum" ) # ---- Rename resulting columns for both catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - - # For the specimen data + + # For the specimen data # ---- Sum the net haul weights from station 1/unaged fish - specimen_weights_sex = ( - specimen_weight_binned - .groupby(contrast_columns)["weight"] - .sum() - ) + specimen_weights_sex = specimen_weight_binned.groupby(contrast_columns)["weight"].sum() # ---- Total (per stratum, if it exists) specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) - + # For the length (unaged) dataset - length_weights_sex = ( - length_weight_binned - .groupby(contrast_columns)["weight_interp"] - .sum() - ) + length_weights_sex = length_weight_binned.groupby(contrast_columns)["weight_interp"].sum() # ---- Further reduce to the grand total (per stratum, if it exists) length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) # ---- Standardize the unaged sexed weights - length_weight_standardized = ( - (length_weights_sex / length_weight_total).unstack(0) - * catch_weights["total_weight"].to_numpy() - ) - + length_weight_standardized = (length_weights_sex / length_weight_total).unstack( + 0 + ) * catch_weights["total_weight"].to_numpy() + # Calculate the specimen weight proportions # ---- Pivot weight bins - specimen_weight_binned_pvt = ( - specimen_weight_binned.pivot_table( - columns=spatial_column, - index=["length_bin", "species_id", "sex"], - values="weight", - observed = False - ) + specimen_weight_binned_pvt = specimen_weight_binned.pivot_table( + columns=spatial_column, + index=["length_bin", "species_id", "sex"], + values="weight", + observed=False, ) # ---- Divide by the aged stratum weights (relative to only aged fish) - specimen_weight_proportions_pvt = ( - specimen_weight_binned_pvt / specimen_weight_total.to_numpy() - ) + specimen_weight_proportions_pvt = specimen_weight_binned_pvt / specimen_weight_total.to_numpy() # ---- Pivot back to the desired format specimen_weight_proportion = ( - specimen_weight_proportions_pvt - .stack().reset_index(name="weight_proportion") - .pivot_table(columns=spatial_column + ["species_id", "sex"], - index="length_bin", 
values="weight_proportion") - ) - # ---- Calculate the internal (i.e. only aged fish) for each sex - within_specimen_sex_proportions = ( - specimen_weight_proportion.sum() + specimen_weight_proportions_pvt.stack() + .reset_index(name="weight_proportion") + .pivot_table( + columns=spatial_column + ["species_id", "sex"], + index="length_bin", + values="weight_proportion", + ) ) + # ---- Calculate the internal (i.e. only aged fish) for each sex + within_specimen_sex_proportions = specimen_weight_proportion.sum() # Calculate the total strata weights # ---- Index `catch_weights` catch_weights_idx = catch_weights.set_index(spatial_column + ["species_id"]) # ---- Compute the spatially-stratified/grouped weights - spatial_weights = ( - pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) - .pivot_table( - columns=spatial_column, - aggfunc="sum", - values="total_weight", - observed=False - ) - ) - + spatial_weights = pd.concat( + [specimen_weight_total.to_frame("total_weight"), catch_weights_idx] + ).pivot_table(columns=spatial_column, aggfunc="sum", values="total_weight", observed=False) + # Calculate the weight proportions relative to the overall stratum weights # ---- Aged # -------- Reformat into dataframe and merge with total stratum weights @@ -1067,9 +1184,9 @@ def weight_proportions(catch_data: pd.DataFrame, specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] ) # -------- Consolidate to calculate the sexed proportions per stratum - specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(spatial_column + ["species_id", "sex"])[ - "weight_proportion_overall" - ].sum() + specimen_weight_sex_proportions = specimen_weights_binned_df.groupby( + spatial_column + ["species_id", "sex"] + )["weight_proportion_overall"].sum() # ---- Unaged # -------- Reformat into dataframe and merge with total stratum weights length_weights_sex_standardized_df = ( @@ -1085,14 +1202,18 @@ def weight_proportions(catch_data: pd.DataFrame, ) # -------- Back-calculate the sexed weight proportions relative to just unaged fish # ------------ Aggregate proportions - length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( - columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" - ).transpose().unstack(["species_id"]).sum(axis=0) + length_total_sex_proportions = ( + length_weights_sex_standardized_df.pivot_table( + columns=["species_id", "sex"], index=spatial_column, values="weight_proportion_overall" + ) + .transpose() + .unstack(["species_id"]) + .sum(axis=0) + ) # ------------ Re-compute the proportions length_weight_sex_proportions = ( length_weights_sex_standardized_df.pivot_table( - index=["species_id", "sex"], columns=spatial_column, - values="weight_proportion_overall" + index=["species_id", "sex"], columns=spatial_column, values="weight_proportion_overall" ) / length_total_sex_proportions.to_numpy() ) @@ -1115,11 +1236,17 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Generate the fitted weight array fitted_weights = length_weight_all.copy() # ---- Get actual length bins in dataset - fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] + fitted_weights = fitted_weights[ + fitted_weights["length_bin"].isin(length_number_proportions["length_bin"]) + ] # ---- Apportion the averaged weights - length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + length_apportioned_weights 
= ( + length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() + ) # ---- Compute the average weight proportions per length bin per stratum - average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) + average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum( + axis=1 + ) # ---- Convert back to a DataFrame average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( name="weight_proportion" @@ -1132,7 +1259,9 @@ def weight_proportions(catch_data: pd.DataFrame, unaged_proportions = 1 - aged_proportions # -------- Re-weight the unaged sexed proportions unaged_weight_sex_proportions_overall = ( - (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) + (length_weight_sex_proportions * unaged_proportions.unstack().transpose()) + .astype(float) + .fillna(0.0) ) unaged_proportions.unstack().transpose() @@ -1148,18 +1277,18 @@ def weight_proportions(catch_data: pd.DataFrame, ) ) # ---- Aged: stratum-sex relative to total weights - aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( - spatial_column + ["species_id", "sex"] - ) + aged_sex_df = within_specimen_sex_proportions.reset_index( + name="weight_proportion_aged" + ).set_index(spatial_column + ["species_id", "sex"]) # ---- Add the aged sex proportiosn relative to the overall survey aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions # ---- Consolidate the aged and unaged sexed dataframes # -------- Initialize the dataframe - aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + spatial_column) - # --------- Add the within-unaged weight proportions - aged_unaged_sex_proportions["weight_proportion_unaged"] = ( - length_weight_sex_proportions.stack() + aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index( + ["species_id", "sex"] + spatial_column ) + # --------- Add the within-unaged weight proportions + aged_unaged_sex_proportions["weight_proportion_unaged"] = length_weight_sex_proportions.stack() # --------- Add the overall-unaged weight proportions aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( unaged_weight_sex_proportions_overall.stack() @@ -1169,10 +1298,10 @@ def weight_proportions(catch_data: pd.DataFrame, # ---- Set index aged_unaged_proportions.set_index(spatial_column + ["species_id"], inplace=True) # -------- Add unaged proportions - aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() + aged_unaged_proportions["unaged_proportions"] = unaged_proportions # .reset_index() # ---- Reset the index aged_unaged_proportions = aged_unaged_proportions.reset_index() - + # Return output return { "aged_weight_proportions_df": aged_overall_df, @@ -1183,10 +1312,12 @@ def weight_proportions(catch_data: pd.DataFrame, "aged_unaged_weight_proportions_df": aged_unaged_proportions, } + # TODO: NEED TO UPDATE TO EITHER INSERT IF NOT PRESENT OR UPDATE OTHERWISE ! ! ! # ! 
SEE ABOVE -def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, - file_configuration: dict): +def summarize_strata( + nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame, file_configuration: dict +): # Get biology database acoustic_db = file_configuration["database"]["acoustics"] @@ -1200,25 +1331,47 @@ def summarize_strata(nasc_biology_data: pd.DataFrame, spatial_data: pd.DataFrame # Create copy strata_df = spatial_data.copy() - # Define new columns - strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", - "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan + # Define new columns + strata_df[ + [ + "length_mean", + "weight_mean", + "TS_mean", + "number_density_mean", + "biomass_density_mean", + "abundance_sum", + "biomass_sum", + ] + ] = np.nan # ---- Drop 'latitude_interval' strata_df.drop(columns=["latitude_interval"], inplace=True) # ---- Create - SQL(biology_db, "create", table_name="strata_summary_df", - dataframe=strata_df, primary_keys=["stratum"]) + SQL( + biology_db, + "create", + table_name="strata_summary_df", + dataframe=strata_df, + primary_keys=["stratum"], + ) # ---- Populate table - SQL(biology_db, "insert", table_name="strata_summary_df", - dataframe=strata_df, id_columns=["stratum"]) - + SQL( + biology_db, + "insert", + table_name="strata_summary_df", + dataframe=strata_df, + id_columns=["stratum"], + ) + # Get unique strata values strata_values = np.unique(nasc_biology_data["stratum"]).tolist() - + # Update the table - sql_update_strata_summary(source_db=acoustic_db, target_db=biology_db, - source_table="survey_data_df", target_table="strata_summary_df", - data_columns=[("number_density", "mean"), - ("biomass_density", "mean")], - strata=strata_values) \ No newline at end of file + sql_update_strata_summary( + source_db=acoustic_db, + target_db=biology_db, + source_table="survey_data_df", + target_table="strata_summary_df", + data_columns=[("number_density", "mean"), ("biomass_density", "mean")], + strata=strata_values, + ) diff --git a/echopop/live/live_core.py b/echopop/live/live_core.py index 388a8240..6c41b33a 100644 --- a/echopop/live/live_core.py +++ b/echopop/live/live_core.py @@ -1,5 +1,3 @@ -from datetime import datetime - import pandas as pd LIVE_DATA_STRUCTURE = { @@ -28,7 +26,7 @@ "results": { "acoustics": dict(), "biology": dict(), - "stratified": dict(), + "stratified": dict(), }, } @@ -54,8 +52,12 @@ "optional_keys": [], "keys": { "*": { - "required_keys": ["number_code", "TS_L_slope", "TS_L_intercept", - "length_units"], + "required_keys": [ + "number_code", + "TS_L_slope", + "TS_L_intercept", + "length_units", + ], "optional_keys": ["character_code"], "keys": { "number_code": int, @@ -78,8 +80,8 @@ "optional_keys": [], "keys": { "bins": [float, int], - }, }, + }, "stations": { "required_keys": ["separate_stations", "station_id"], "optional_keys": [], @@ -99,7 +101,7 @@ }, "geospatial": { "required_keys": ["projection", "link_biology_acoustics"], - "optional_keys": ["inpfc", "griddify"], + "optional_keys": ["inpfc", "griddify"], "keys": { "inpfc": { "required_keys": ["latitude_max", "stratum_names"], @@ -120,20 +122,22 @@ "latitude": [float], "longitude": [float], "x": [float], - "y": [float] + "y": [float], }, - }, + }, "grid_resolution": { - "required_keys":[("latitude_distance", "longitude_distance"), - ("x_distance", "y_distance")], + "required_keys": [ + ("latitude_distance", "longitude_distance"), + ("x_distance", "y_distance"), + ], "optional_keys": [], "keys": { 
"longitude_distance": float, "latitude_distance": float, "x_distance": float, "y_distnace": float, - } - } + }, + }, }, }, "link_biology_acoustics": ["closest_haul", "global", "INPFC", "weighted_haul"], @@ -146,7 +150,7 @@ # Required data configuration YAML structure LIVE_CONFIG_DATA_MODEL = { "required_keys": ["ship_id", "survey_year", "database_directory", "input_directories"], - "optional_keys": ["species", "data_root_dir"], + "optional_keys": ["species", "data_root_dir"], "keys": { "data_root_dir": str, "database_directory": str, @@ -164,8 +168,14 @@ }, }, "biology": { - "required_keys": ["database_name", "directory", "extension", "file_index", - "file_ids", "file_name_formats"], + "required_keys": [ + "database_name", + "directory", + "extension", + "file_index", + "file_ids", + "file_name_formats", + ], "optional_keys": [], "keys": { "directory": str, @@ -184,14 +194,14 @@ "keys": { "*": str, }, - }, + }, "file_index": { "required_keys": ["*"], "optional_keys": [], "keys": { "*": [str], }, - }, + }, }, }, "coastline": { @@ -234,11 +244,11 @@ }, "xarray_variables": { "NASC": float, - "frequency_nominal": float, + "frequency_nominal": float, "latitude": float, "longitude": float, "ping_time": "datetime64[ns]", - } + }, }, "biology": { "catch": { @@ -253,7 +263,7 @@ "species_code": "species_id", "overall_weight": "haul_weight", "catch_perc": "catch_percentage", - } + }, }, "trawl_info": { "dtypes": { @@ -298,7 +308,7 @@ "partition": "trawl_partition", "sex": "sex", "length": "length", - "organism_weight": "weight" + "organism_weight": "weight", }, }, }, @@ -320,16 +330,8 @@ "dtype": int, "expression": r"(?P\d+)", }, - "SPECIES_CODE": { - "name": "species_id", - "dtype": int, - "expression": r"(?P\d+)" - }, - "FILE_ID": { - "name": "file_id", - "dtype": str, - "expression": r"(?P.+)" - }, + "SPECIES_CODE": {"name": "species_id", "dtype": int, "expression": r"(?P\d+)"}, + "FILE_ID": {"name": "file_id", "dtype": str, "expression": r"(?P.+)"}, } SPATIAL_CONFIG_MAP = { @@ -338,21 +340,13 @@ "choices": ["distance", "time"], }, }, - "global" : {}, + "global": {}, "griddify": { "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, + "longitude": {"types": [float]}, + "latitude": {"types": [float]}, + "northings": {"types": [float]}, + "eastings": {"types": [float]}, "pairs": [("longitude", "latitude"), ("northings", "eastings")], }, "grid_resolution": { @@ -374,21 +368,20 @@ "grid_size_y": { "types": int, }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], + "pairs": [ + ("x_distance", "y_distance"), + ("d_longitude", "d_latitude"), + ("grid_size_x", "grid_size_y"), + ], }, }, "inpfc": { - "stratum_names": { - "types": [int, str] - }, + "stratum_names": {"types": [int, str]}, "latitude_max": { "types": [float], }, }, "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, + "proximity": {"choices": ["distance", "time"]}, }, -} \ No newline at end of file +} diff --git a/echopop/live/live_data_loading.py b/echopop/live/live_data_loading.py index 3018604e..f763c0be 100644 --- a/echopop/live/live_data_loading.py +++ b/echopop/live/live_data_loading.py @@ -1,32 +1,34 @@ -from pathlib import Path -from typing import Union, Tuple, Optional, List -import yaml +import copy +import os import re -from .sql_methods import SQL, query_processed_files, sql_data_exchange, initialize_database -import pandas as pd 
-import numpy as np from datetime import datetime -import xarray as xr -import os -import copy +from pathlib import Path +from typing import List, Optional, Union + import boto3 +import numpy as np +import pandas as pd +import xarray as xr +import yaml from botocore.exceptions import ClientError -from .live_core import( +from .live_core import ( + LIVE_CONFIG_DATA_MODEL, + LIVE_CONFIG_INIT_MODEL, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP, - LIVE_CONFIG_INIT_MODEL, - LIVE_CONFIG_DATA_MODEL ) - from .live_spatial_methods import create_inpfc_strata +from .sql_methods import initialize_database, query_processed_files + # TODO: Incorporate complete YAML file validator # TODO: Documentation -def live_configuration(live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path]): - +def live_configuration( + live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path] +): + # Validate file existence # ---- str-to-Path conversion, if necessary live_init_config_path = Path(live_init_config_path) @@ -42,19 +44,21 @@ def live_configuration(live_init_config_path: Union[str, Path], ] raise FileNotFoundError( f"The following configuration files do not exist: {missing_config}." - ) + ) # Read the YAML configuration/recipe file to parameterize the `LiveSurvey` class # ---- Initialization settings init_config = yaml.safe_load(Path(live_init_config_path).read_text()) # -------- Validate - init_config = validate_live_config(copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, - live_init_config_path) + init_config = validate_live_config( + copy.deepcopy(init_config), LIVE_CONFIG_INIT_MODEL, live_init_config_path + ) # ---- Filepath/directory settings file_config = yaml.safe_load(Path(live_file_config_path).read_text()) - file_config = validate_live_config(copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, - live_file_config_path) - + file_config = validate_live_config( + copy.deepcopy(file_config), LIVE_CONFIG_DATA_MODEL, live_file_config_path + ) + # Check for intersecting/duplicative configuration keys # ---- Compare sets of keys from each dictionary config_intersect = set(init_config.keys()).intersection(set(file_config.keys())) @@ -65,36 +69,38 @@ def live_configuration(live_init_config_path: Union[str, Path], f"keys: {' ,'.join(config_intersect)}. Key names must be unique for each configuration " f"file." ) - + # Combine both into a dictionary output that can be added to the `LiveSurvey` class object return {**init_config, **file_config} -def read_acoustic_files(acoustic_files: List[str], - xarray_kwargs: dict = {}) -> tuple: + +def read_acoustic_files(acoustic_files: List[str], xarray_kwargs: dict = {}) -> tuple: # Get the file-specific settings, datatypes, columns, etc. 
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` acoustics_config_map = LIVE_INPUT_FILE_CONFIG_MAP["acoustics"] # Read all of the zarr files - results_list = [(data_df, unit_dict) if i ==0 else (data_df, None) - for i, (data_df, unit_dict) in enumerate( - read_acoustic_zarr(file, acoustics_config_map, - xarray_kwargs=xarray_kwargs) - for file in acoustic_files - )] + results_list = [ + (data_df, unit_dict) if i == 0 else (data_df, None) + for i, (data_df, unit_dict) in enumerate( + read_acoustic_zarr(file, acoustics_config_map, xarray_kwargs=xarray_kwargs) + for file in acoustic_files + ) + ] # Concatenate the dataframe component - acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index = True) + acoustic_data_df = pd.concat([df for df, _ in results_list], ignore_index=True) # ---- Add the `acoustic_data_units` to the dictionary and output the resulting tuple return acoustic_data_df, results_list[0][1] if results_list else None -def filter_filenames(directory_path: Path, filename_id: str, - files: List[Path], - file_extension: str): + +def filter_filenames( + directory_path: Path, filename_id: str, files: List[Path], file_extension: str +): # Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', filename_id) + file_id_format = re.sub(r"\{FILE_ID:([^}]+)\}", r"\1", filename_id) # ---- Replace all other tags with `*` placeholders file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) # ---- Compile the pattern @@ -102,8 +108,9 @@ def filter_filenames(directory_path: Path, filename_id: str, pattern = re.compile(escaped_file_id_format.replace(r"\*", ".*")) # pattern = re.compile(rf'{file_id_format.replace(".", r"\.").replace("*", ".*")}') # ---- Create Path object with the generalized format: S3 - s3_files = [filename for filename in files - if filename.startswith("s3://") and pattern.search(filename)] + s3_files = [ + filename for filename in files if filename.startswith("s3://") and pattern.search(filename) + ] # ---- Local search local_files = Path(directory_path).glob(f"{file_id_format}.{file_extension}") # ---- Assign to subfile path object @@ -116,19 +123,21 @@ def filter_filenames(directory_path: Path, filename_id: str, # Convert list of proposed files from Path to String file_str = [str(file) for file in list(files)] - + # Find intersection with the proposed filenames and return the output return list(set(subfile_str).intersection(set(file_str))) -def read_biology_files(biology_files: List[str], file_configuration: dict, - pandas_kwargs: dict = {}): + +def read_biology_files( + biology_files: List[str], file_configuration: dict, pandas_kwargs: dict = {} +): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] # Get the file-specific settings, datatypes, columns, etc. 
# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` - biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] + biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] # ---- Extract the expected file name ID's biology_file_ids = file_settings["file_name_formats"] # ---- Extract all of the file ids @@ -141,54 +150,59 @@ def read_biology_files(biology_files: List[str], file_configuration: dict, directory_path = "/".join([file_configuration["data_root_dir"], file_settings["directory"]]) else: directory_path = file_settings["directory"] - + # Add SQL file to dict # file_configuration["database"]["biology"] = ( - # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] + # Path(file_configuration["data_root_dir"]) / "database" / file_settings["database_name"] # ) file_configuration["database"]["biology"] = ( - # Path(file_configuration["database_directory"]) / file_settings["database_name"] - "/".join([file_configuration["database_directory"], file_settings["database_name"]]) + # Path(file_configuration["database_directory"]) / file_settings["database_name"] + "/".join([file_configuration["database_directory"], file_settings["database_name"]]) ) # Iterate through the different biology datasets and read them in for dataset in list(biology_file_ids.keys()): # ---- Get dataset-specific file lists - dataset_files = filter_filenames(directory_path, - biology_file_ids[dataset], - biology_files, - file_settings["extension"]) + dataset_files = filter_filenames( + directory_path, biology_file_ids[dataset], biology_files, file_settings["extension"] + ) # ---- If there are dataset files available if dataset_files: # ---- Read in validated biology data - dataframe_list = [read_biology_csv(file, - file_settings["file_name_formats"][dataset], - biology_config_map[dataset], - pandas_kwargs) - for file in dataset_files] + dataframe_list = [ + read_biology_csv( + file, + file_settings["file_name_formats"][dataset], + biology_config_map[dataset], + pandas_kwargs, + ) + for file in dataset_files + ] # ---- Concatenate the dataset dataframe_combined = pd.concat(dataframe_list, ignore_index=True) # ---- Lower-case sex - if "sex" in dataframe_combined.columns: + if "sex" in dataframe_combined.columns: dataframe_combined["sex"] = dataframe_combined["sex"].str.lower() # ---- Lower-case trawl partition type - if "trawl_partition" in dataframe_combined.columns: - dataframe_combined["trawl_partition"] = dataframe_combined["trawl_partition"].str.lower() + if "trawl_partition" in dataframe_combined.columns: + dataframe_combined["trawl_partition"] = dataframe_combined[ + "trawl_partition" + ].str.lower() # ---- Reformat datetime column if "datetime" in dataframe_combined.columns: dataframe_combined["datetime"] = convert_datetime(dataframe_combined["datetime"]) # ---- Add to the data dictionary biology_output[f"{dataset}_df"] = dataframe_combined - + # Return the output return biology_output + def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) -> tuple: - + # Format the file reading configuration # ---- Concatenate into a full configuration map - full_config_map = {**config_map["xarray_coordinates"], - **config_map["xarray_variables"]} + full_config_map = {**config_map["xarray_coordinates"], **config_map["xarray_variables"]} # Determine the file loading method for the `acoustic_files` zarr_data_ds = xr.open_dataset(file, engine="zarr", chunks="auto", **xarray_kwargs) @@ -197,11 +211,9 @@ def read_acoustic_zarr(file: Path, config_map: 
dict, xarray_kwargs: dict = {}) - # ---- Convert to a DataFrame zarr_data_df = zarr_data_ds.to_dataframe().reset_index() # ---- Check for any missing columns - missing_columns = ( - [key for key in full_config_map.keys() if key not in zarr_data_df.columns] - ) + missing_columns = [key for key in full_config_map.keys() if key not in zarr_data_df.columns] # ---- Raise Error, if needed - if missing_columns: + if missing_columns: raise ValueError( f"The following columns are missing from at least one file: in " f"{', '.join(missing_columns)}!" @@ -210,7 +222,7 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - zarr_data_df_filtered = zarr_data_df[full_config_map.keys()].astype(full_config_map) # Add the filename as a column - zarr_data_df_filtered["source"] = Path(file).name + zarr_data_df_filtered["source"] = Path(file).name # Gather some of the units data_units = { @@ -222,6 +234,7 @@ def read_acoustic_zarr(file: Path, config_map: dict, xarray_kwargs: dict = {}) - # Return a Tuple return zarr_data_df_filtered, data_units + def construct_directorypath(file_configuration: dict, file_settings: dict): """Construct the root directory path.""" @@ -235,18 +248,21 @@ def construct_directorypath(file_configuration: dict, file_settings: dict): data_directory = file_settings["directory"] # Return the directory path - if root_directory != "": + if root_directory != "": return "/".join([root_directory, data_directory]) else: return data_directory + def is_s3_path(path): """Check if a path is an S3 path.""" return path.startswith("s3://") + # TODO: Documentation -def validate_data_directory(file_configuration: dict, dataset: str, - input_filenames: Optional[list] = None) -> List[Path]: +def validate_data_directory( + file_configuration: dict, dataset: str, input_filenames: Optional[list] = None +) -> List[Path]: # Get the dataset file settings file_settings = file_configuration["input_directories"][dataset] @@ -256,10 +272,8 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Validate `input_filenames` input if input_filenames is not None and not isinstance(input_filenames, list): - raise TypeError( - "Data loading argument `input_filenames` must be a list." 
- ) - + raise TypeError("Data loading argument `input_filenames` must be a list.") + # Format data filenames if input_filenames is not None: data_files = ["/".join([directory_path, filename]) for filename in input_filenames] @@ -277,14 +291,16 @@ def validate_data_directory(file_configuration: dict, dataset: str, # ---- Validate validate_local_path(directory_path, file_settings) # ---- Format data files - if input_filenames is None: + if input_filenames is None: data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) - + # Clean the filenames data_files = [ - re.sub(r'//', r'\\', str(filename)).replace('/', '\\') - if not str(filename).startswith('s3://') - else str(filename) + ( + re.sub(r"//", r"\\", str(filename)).replace("/", "\\") + if not str(filename).startswith("s3://") + else str(filename) + ) for filename in data_files ] @@ -296,81 +312,82 @@ def validate_data_directory(file_configuration: dict, dataset: str, # Drop incomplete datasets if dataset == "biology": - data_files = validate_complete_biology_dataset(data_files, - directory_path, - file_configuration) - + data_files = validate_complete_biology_dataset( + data_files, directory_path, file_configuration + ) + # Query the SQL database to process only new files (or create the db file in the first place) - valid_files, file_configuration["database"][dataset] = ( - query_processed_files(database_root_directory, file_settings, data_files) + valid_files, file_configuration["database"][dataset] = query_processed_files( + database_root_directory, file_settings, data_files ) # Return the valid filenames/paths return valid_files + def validate_s3_path(s3_path: str, cloud_credentials: dict): """Check if (parts of) S3 path exists.""" # Redundant validation that S3 object validation is appropriate if not is_s3_path(s3_path): - raise ValueError("The path is not an S3 path.") - + raise ValueError("The path is not an S3 path.") + # Validate credentials - if not all([True if param in cloud_credentials.keys() else False - for param in ["key", "secret"]]): + if not all( + [True if param in cloud_credentials.keys() else False for param in ["key", "secret"]] + ): # ---- Find missing credentials missing_creds = set(["key", "secret"]) - set(cloud_credentials) # ---- Format into string missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) # ---- Raise Error - raise PermissionError( - f"Required S3 credentials missing: {missing_creds_str}." - ) + raise PermissionError(f"Required S3 credentials missing: {missing_creds_str}.") # Remove the s3:// prefix - s3_path_reduced = s3_path[len("s3://"):] + s3_path_reduced = s3_path[len("s3://") :] # Split into bucket and key parts = s3_path_reduced.split("/", 1) if len(parts) < 2: raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - + # Get bucket name and directory keys bucket_name, directory = parts # Initialize the S3 client - s3_client = boto3.client("s3", - aws_access_key_id=cloud_credentials["key"], - aws_secret_access_key=cloud_credentials["secret"]) - + s3_client = boto3.client( + "s3", + aws_access_key_id=cloud_credentials["key"], + aws_secret_access_key=cloud_credentials["secret"], + ) + # Check if the bucket exists try: s3_client.head_bucket(Bucket=bucket_name) - except ClientError as e: + except ClientError: raise FileNotFoundError( f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
) - + # Check if the S3 directory exists try: - # ---- Ping a response from the bucket + # ---- Ping a response from the bucket response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) # ---- Check for `Contents` if "Contents" not in response: raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") - except ClientError as e: + except ClientError as e: # --- Raise Error and propagate it upwards raise e + def validate_local_path(directory_path: str, file_settings: dict): # Validate filepath # ---- Error evaluation (if applicable) if not Path(directory_path).exists(): - raise FileNotFoundError( - f"The data directory [{directory_path}] does not exist." - ) - + raise FileNotFoundError(f"The data directory [{directory_path}] does not exist.") + # Validate that files even exist # ---- List available files of target extension data_files = list(Path(directory_path).glob(f"*{'.'+file_settings['extension']}")) @@ -381,9 +398,9 @@ def validate_local_path(directory_path: str, file_settings: dict): ) -def validate_complete_biology_dataset(data_files: List[str], - directory_path: str, - file_configuration: dict): +def validate_complete_biology_dataset( + data_files: List[str], directory_path: str, file_configuration: dict +): # Get the biology data file settings file_settings = file_configuration["input_directories"]["biology"] @@ -396,33 +413,33 @@ def validate_complete_biology_dataset(data_files: List[str], def get_file_haul_number(filename, format_string): # Step 1: Extract the filename from the full path filename_only = os.path.basename(filename) - + # Remove the file extension from the filename filename_no_ext = os.path.splitext(filename_only)[0] # Split the format string and filename into parts - format_parts = re.findall(r'\{[^}]+\}|[^_]+', format_string) - filename_parts = filename_no_ext.split('_') + format_parts = re.findall(r"\{[^}]+\}|[^_]+", format_string) + filename_parts = filename_no_ext.split("_") # Find the index of {HAUL} in format_parts - haul_index = format_parts.index('{HAUL}') + haul_index = format_parts.index("{HAUL}") # Extract and return the haul number from filename_parts if haul_index < len(filename_parts): return filename_parts[haul_index] return None - + # Organize dataset by their respective dataset-type - dataset_dict = {key: filter_filenames(directory_path, - ds, - data_files, - file_settings["extension"]) - for key, ds in biology_file_ids.items()} - + dataset_dict = { + key: filter_filenames(directory_path, ds, data_files, file_settings["extension"]) + for key, ds in biology_file_ids.items() + } + # Extract the haul numbers extracted_hauls = { - key: set(get_file_haul_number(filename, biology_file_ids.get(key, '')) - for filename in filenames) + key: set( + get_file_haul_number(filename, biology_file_ids.get(key, "")) for filename in filenames + ) for key, filenames in dataset_dict.items() } @@ -434,8 +451,7 @@ def get_file_haul_number(filename, format_string): filename for key, filenames in dataset_dict.items() for filename in filenames - if get_file_haul_number(filename, biology_file_ids.get(key, '')) - in common_hauls + if get_file_haul_number(filename, biology_file_ids.get(key, "")) in common_hauls ] # Get bad files for DEBUG @@ -443,14 +459,12 @@ def get_file_haul_number(filename, format_string): filename for key, filenames in dataset_dict.items() for filename in filenames - if get_file_haul_number(filename, biology_file_ids.get(key, '')) - not in common_hauls + if get_file_haul_number(filename, biology_file_ids.get(key, "")) 
not in common_hauls ] # ---- Create list non_filtered_filenames_lst = "\n".join(non_filtered_filenames) print( - f"The following files are parts of incomplete filesets: \n" - f"{non_filtered_filenames_lst}" + f"The following files are parts of incomplete filesets: \n" f"{non_filtered_filenames_lst}" ) # Return the curated filename list @@ -461,40 +475,37 @@ def compile_filename_format(file_name_format: str): # Create a copy of `file_name_format` regex_pattern = file_name_format - + # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern for key, value in LIVE_FILE_FORMAT_MAP.items(): regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"(?P\1)", regex_pattern) # Compile the regex pattern and return the output return re.compile(regex_pattern) + def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_kwargs: dict = {}): # Read in the `*.csv` file - df = pd.read_csv(file, - usecols=list(config_map["dtypes"].keys()), - storage_options=pandas_kwargs) + df = pd.read_csv(file, usecols=list(config_map["dtypes"].keys()), storage_options=pandas_kwargs) # Validate the dataframe # ---- Check for any missing columns - missing_columns = ( - [key for key in config_map["dtypes"].keys() if key not in df.columns] - ) + missing_columns = [key for key in config_map["dtypes"].keys() if key not in df.columns] # ---- Raise Error, if needed - if missing_columns: + if missing_columns: raise ValueError( f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" ) # ---- Ensure the correct datatypes df_validated = df.astype(config_map["dtypes"]) - # ---- Replace column names and drop + # ---- Replace column names and drop df_validated = df_validated.rename(columns=config_map["names"]) # Get the substring components that can be added to the DataFrame - filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + filename_substrings = re.findall(r"\{([^:}]+)(?::[^}]+)?}", pattern) # ---- Create sub-list of columns that can be added to the DataFrame valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) @@ -504,30 +515,32 @@ def read_biology_csv(file: Path, pattern: re.Pattern, config_map: dict, pandas_k match_obj = compiled_regex.search(file) # Iterate through the filename-derived tags and add them to the DataFrame - for i in valid_tags: + for i in valid_tags: matched_key = LIVE_FILE_FORMAT_MAP[i] df_validated[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) # Return the resulting DataFrame return df_validated + def infer_datetime_format(timestamp_str: Union[int, str]): patterns = { - r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS - r"^\d{8}$": "%Y%m%d", # YYYYMMDD - r"^\d{6}$": "%H%M%S", # HHMMSS + r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS + r"^\d{8}$": "%Y%m%d", # YYYYMMDD + r"^\d{6}$": "%H%M%S", # HHMMSS r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS - r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD - r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD + r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD + r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d", # YYYY/MM/DD } - + for pattern, date_format in patterns.items(): if re.match(pattern, timestamp_str): return date_format - + raise ValueError("Unknown timestamp 
format") + def convert_datetime(timestamp: Union[int, str, pd.Series]): if isinstance(timestamp, pd.Series): @@ -544,6 +557,7 @@ def convert_datetime(timestamp: Union[int, str, pd.Series]): else: return datetime.strptime(timestamp, datetime_format) + def validate_hauls_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -552,20 +566,21 @@ def validate_hauls_config(spatial_config: dict, link_method: str): # Extract the defined settings input_method_settings = spatial_config[link_method] - # Check for `proximity` + # Check for `proximity` if "proximity" not in input_method_settings.keys(): raise KeyError( "The following parameters are missing from the biology-acoustic linking method: " "'proximity'!" ) - + # Evaluate valid options for `proximity` if input_method_settings["proximity"] not in link_method_settings["proximity"]["choices"]: raise KeyError( f"Value biology-acoustic linking method parameter `proximity` must be one of the : " f"following: {link_method_settings['proximity']['choices']}." - ) - + ) + + def validate_griddify_config(spatial_config: dict, link_method: str): # Get the link method configuration map @@ -581,8 +596,8 @@ def validate_griddify_config(spatial_config: dict, link_method: str): raise KeyError( f"The following parameters are missing from the biology-acoustic linking method: " f"{list(key_diff)}!" - ) - + ) + # Iterate through the keys to evaluate inputs for key in list(input_method_settings.keys()): # ---- Subset the input method config @@ -596,7 +611,7 @@ def validate_griddify_config(spatial_config: dict, link_method: str): raise KeyError( f"Unexpected parameter(s) ('{parameter_diff}') detected in '{link_method}' " f"configuration." - ) + ) # ---- Check if the appropriate coordinate pairs are present coordinate_pairs = [set(param).intersection(set(input.keys())) for param in model["pairs"]] # ---- Count the number of paired coordinates @@ -630,7 +645,8 @@ def validate_griddify_config(spatial_config: dict, link_method: str): f"Biology-acoustic linking method argument '{parameter}' within '{key}' " f"for method '{link_method}' must be one of the following types within a list: " f"{config_dtypes}." - ) + ) + def validate_inpfc_config(spatial_config: dict, link_method: str): @@ -648,14 +664,14 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): f"The following parameters are missing from the biology-acoustic linking method: " f"{list(key_diff)}!" ) - + # Iterate through the keys to evaluate inputs for key in list(input_method_settings.keys()): # ---- Subset the input method config input = input_method_settings[key] # ---- Get the original config of the dtypes model = link_method_settings[key]["types"] - # ---- Evaluate if a list + # ---- Evaluate if a list if not isinstance(input, list): raise TypeError( f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " @@ -666,8 +682,9 @@ def validate_inpfc_config(spatial_config: dict, link_method: str): raise TypeError( f"Biology-acoustic linking method argument '{key}' for method '{link_method}' must " f"be one of the following types within a list: {model}." 
- ) - + ) + + def configure_spatial_settings(file_configuration: dict): # Extract spatial strata *only* if spatial information from the configuration settings @@ -690,16 +707,19 @@ def configure_spatial_settings(file_configuration: dict): spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) # ---- Update the stratum classification in the primary file configuration file_configuration.update({"spatial_column": ["stratum"]}) - else: + else: # ---- Empty `spatial_column` key file_configuration.update({"spatial_column": []}) # Add grid - file_configuration.update({"gridding_column": file_configuration["spatial_column"] + ["x", "y"]}) + file_configuration.update( + {"gridding_column": file_configuration["spatial_column"] + ["x", "y"]} + ) # Return the dictionary as an output return spatial_dict + def validate_spatial_config(spatial_config: dict): # Check the link method @@ -711,7 +731,7 @@ def validate_spatial_config(spatial_config: dict): f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " f"include: 'global', 'closest_haul', 'weighted_haul', 'griddify', and 'INPFC'." ) - + # Verify that associated parameters are present in the configuration settings # ---- Get keys as a list config_keys = list(spatial_config.keys()) @@ -720,18 +740,19 @@ def validate_spatial_config(spatial_config: dict): raise ValueError( f"No parameters provided for the biology-acoustic linking ([{link_method}])." ) - + # Check key settings - if link_method == "griddify": + if link_method == "griddify": validate_griddify_config(spatial_config, link_method) - elif link_method == "inpfc": + elif link_method == "inpfc": validate_inpfc_config(spatial_config, link_method) - elif link_method != "global": + elif link_method != "global": validate_hauls_config(spatial_config, link_method) + def validate_live_config(config: dict, reference_model: dict, filename: Union[str, Path]): """Validate configuration inputs""" - + # Convert to string if Path if isinstance(filename, Path): filename = str(filename) @@ -752,6 +773,7 @@ def validate_keys(config, model, path=""): def get_keys_from_tuples(tuples): """Parse key names from tuples""" return {key for group in tuples if isinstance(group, tuple) for key in group} + # ---- def find_missing_keys(required_keys, keys_to_check): """Find any missing keys""" @@ -764,14 +786,16 @@ def find_missing_keys(required_keys, keys_to_check): missing_keys = [key for key in valid_keys_in_tuples if key not in keys_to_check] unexpected_keys = [key for key in keys_to_check if key not in all_required_keys] return missing_keys, unexpected_keys + # ---- def check_for_missing_keys(required_keys, config_keys, path): """Check whether any required keys are missing""" missing_required = [] for key in required_keys: if isinstance(key, tuple): - missing_keys, unexpected_keys_for_keys = find_missing_keys(required_keys, - config_keys) + missing_keys, unexpected_keys_for_keys = find_missing_keys( + required_keys, config_keys + ) if missing_keys: raise ValueError( f"Missing required configuration key(s): " @@ -787,31 +811,38 @@ def check_for_missing_keys(required_keys, config_keys, path): f"{path} in configuration file '{filename}'." 
) return [] + # ---- def check_for_unexpected_keys(config_keys, required_keys): """Check for unexpected keys""" unexpected_keys = [] for key in config_keys: - if (key not in required_keys - and key not in optional_keys - and "*" not in required_keys): + if ( + key not in required_keys + and key not in optional_keys + and "*" not in required_keys + ): if not any(key in group for group in required_keys if isinstance(group, tuple)): unexpected_keys.append(key) return unexpected_keys # Top-level validation if path == "": - missing_primary_keys = [key for key in required_keys - if key != "*" and key not in config] + missing_primary_keys = [ + key for key in required_keys if key != "*" and key not in config + ] if missing_primary_keys: raise ValueError( f"Missing primary configuration key(s): {', '.join(missing_primary_keys)} in " f"configuration file '{filename}'." ) - unexpected_primary_keys = [key for key in config - if key not in required_keys - and key not in optional_keys - and "*" not in required_keys] + unexpected_primary_keys = [ + key + for key in config + if key not in required_keys + and key not in optional_keys + and "*" not in required_keys + ] # ---- Raise error if unexpected_primary_keys: raise ValueError( @@ -828,14 +859,15 @@ def check_for_unexpected_keys(config_keys, required_keys): raise ValueError( f"Unexpected key(s) found: {', '.join(unexpected_keys)} at {path} in " f"configuration file '{filename}'." - ) + ) # Recursively validate nested dictionaries and lists for key, sub_model in keys.items(): if key == "*" and isinstance(sub_model, dict): for sub_key in config: - validate_keys(config[sub_key], - sub_model, path=f"{path}.{sub_key}" if path else sub_key) + validate_keys( + config[sub_key], sub_model, path=f"{path}.{sub_key}" if path else sub_key + ) elif key == "*" and isinstance(sub_model, list): for sub_key in config: validate_list(config[sub_key], sub_model, key, path) @@ -879,6 +911,7 @@ def validate_list(config_value, allowed_types, key, path): f"Invalid type for key '{key}' at {path} in {filename}. 
Expected a list of: " f"{allowed_types}" ) + # ---- def validate_type(config_value, expected_type, key, path): """Validate configuration with model that is at the furthest point along a branch""" diff --git a/echopop/live/live_data_processing.py b/echopop/live/live_data_processing.py index a2dcaa46..07672a0f 100644 --- a/echopop/live/live_data_processing.py +++ b/echopop/live/live_data_processing.py @@ -1,31 +1,24 @@ -import yaml -import re +from typing import List + +import numpy as np +import pandas as pd -from functools import reduce -from .sql_methods import SQL, sql_group_update, query_dataset, get_unique_identifiers from .live_biology import summarize_strata from .live_spatial_methods import update_population_grid -from pathlib import Path -from typing import Union, Tuple, Optional, List - -import pandas as pd +from .sql_methods import query_dataset, sql_group_update -import numpy as np -from .live_core import( - LIVE_FILE_FORMAT_MAP, - LIVE_INPUT_FILE_CONFIG_MAP -) +def get_average_strata_weights(db_file: str, data_dict: dict, unique_columns: list): -def get_average_strata_weights(db_file: str, - data_dict: dict, - unique_columns: list): - # Get corresponding `weight_fitted_df` from the database - weight_fitted_sql_df = query_dataset(db_file, data_dict, table_name="weight_stratum_df", - data_columns=unique_columns + ["average_weight"], - unique_columns=unique_columns, - constraint="sex == 'all'") + weight_fitted_sql_df = query_dataset( + db_file, + data_dict, + table_name="weight_stratum_df", + data_columns=unique_columns + ["average_weight"], + unique_columns=unique_columns, + constraint="sex == 'all'", + ) # ---- Use SQL table data if present if weight_fitted_sql_df is not None and not weight_fitted_sql_df.empty: # ---- Return output @@ -33,6 +26,7 @@ def get_average_strata_weights(db_file: str, else: return None + def configure_database_paths(file_configuration: dict): # Extract input directory settings @@ -42,16 +36,22 @@ def configure_database_paths(file_configuration: dict): database_dir = file_configuration["database_directory"] # Update configuration - file_configuration["database"].update({ - dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) - for dataset in file_settings.keys() if "database_name" in file_settings[dataset] - }) + file_configuration["database"].update( + { + dataset: "/".join([database_dir, file_settings[dataset]["database_name"]]) + for dataset in file_settings.keys() + if "database_name" in file_settings[dataset] + } + ) + -def acoustic_pipeline(acoustic_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): +def acoustic_pipeline( + acoustic_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = [], +): # Get spatial column spatial_column = file_configuration["spatial_column"] @@ -68,40 +68,32 @@ def acoustic_pipeline(acoustic_dict: dict, if acoustic_dict["nasc_df"] is None or acoustic_dict["nasc_df"].empty: # ---- Print, if verbose if verbose: - print( - f"No new processed acoustic data available for processing." 
- ) + print("No new processed acoustic data available for processing.") else: # Get related acoustic data - acoustic_df = get_nasc_sql_data(acoustic_db, - acoustic_dict, - unique_columns=unique_columns) - + acoustic_df = get_nasc_sql_data(acoustic_db, acoustic_dict, unique_columns=unique_columns) + # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(biology_db, - acoustic_dict, - unique_columns=["stratum"]) - + sigma_bs_df = get_sigma_bs_sql_data(biology_db, acoustic_dict, unique_columns=["stratum"]) + # Calculate population estimates if valid data are available if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets nasc_biology = acoustic_df.merge(sigma_bs_df, on=spatial_column) # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + nasc_biology["number_density"] = nasc_biology["nasc"] / ( + 4.0 * np.pi * nasc_biology["sigma_bs_mean"] ) # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - acoustic_dict, - unique_columns=spatial_column + contrast_columns) - + weight_spatial_averages = get_average_strata_weights( + biology_db, acoustic_dict, unique_columns=spatial_column + contrast_columns + ) + if weight_spatial_averages is not None: # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, - on=spatial_column) + nasc_biology = nasc_biology.merge(weight_spatial_averages, on=spatial_column) # Compute biomass densities nasc_biology["biomass_density"] = ( @@ -109,49 +101,65 @@ def acoustic_pipeline(acoustic_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if (all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) - and not nasc_biology.empty): - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["id"]) - + if ( + all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]) + and not nasc_biology.empty + ): + sql_group_update( + acoustic_db, + dataframe=nasc_biology, + table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["id"], + ) + # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) # Update grid - update_population_grid(file_configuration, coordinates=["x", "y"], - dataset=nasc_biology) + update_population_grid( + file_configuration, coordinates=["x", "y"], dataset=nasc_biology + ) + + +def get_nasc_sql_data(db_file: str, data_dict: dict, unique_columns: List[str]): -def get_nasc_sql_data(db_file: str, - data_dict: dict, - unique_columns: List[str]): - # Add SELECTION columns - data_columns = ( - unique_columns + ["longitude", "latitude", "ping_time", "nasc", "number_density", - "biomass_density", "id"] - ) + data_columns = unique_columns + [ + "longitude", + "latitude", + "ping_time", + "nasc", + "number_density", + "biomass_density", + "id", + ] # ----- Get the SQL dataset - nasc_sql_data = query_dataset(db_file, - data_dict, - table_name="survey_data_df", - data_columns = data_columns, - unique_columns=unique_columns, - constraint="nasc > 0.0") + nasc_sql_data = query_dataset( + db_file, + data_dict, + table_name="survey_data_df", + 
data_columns=data_columns, + unique_columns=unique_columns, + constraint="nasc > 0.0", + ) # ---- Use SQL table data if present if nasc_sql_data is not None and not nasc_sql_data.empty: return nasc_sql_data elif "nasc_df" in data_dict.keys(): return data_dict["nasc_df"] -def get_sigma_bs_sql_data(db_file: str, - data_dict: dict, - unique_columns: list): + +def get_sigma_bs_sql_data(db_file: str, data_dict: dict, unique_columns: list): # Get corresponding `sigma_bs` DataFrame - sigma_bs_sql_df = query_dataset(db_file, data_dict, table_name="sigma_bs_mean_df", - data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], - unique_columns=unique_columns) + sigma_bs_sql_df = query_dataset( + db_file, + data_dict, + table_name="sigma_bs_mean_df", + data_columns=unique_columns + ["sigma_bs", "sigma_bs_count"], + unique_columns=unique_columns, + ) # ---- Use SQL table data if present if sigma_bs_sql_df is not None and not sigma_bs_sql_df.empty: # ---- Compute the weighted average @@ -165,14 +173,15 @@ def get_sigma_bs_sql_data(db_file: str, return sigma_bs_mean_sql_df else: return None - -def biology_pipeline(biology_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): +def biology_pipeline( + biology_dict: dict, + strata_df: pd.DataFrame, + file_configuration: dict, + verbose: bool, + contrast_columns: List[str] = [], +): # Get spatial column spatial_column = file_configuration["spatial_column"] @@ -186,43 +195,36 @@ def biology_pipeline(biology_dict: dict, # Check for data completion # ---- List of boolean values - full_biology_data = ( - [True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) - else False for _, df in biology_dict.items()] - ) + full_biology_data = [ + True if (isinstance(df, pd.DataFrame) and not df.empty) or (isinstance(df, dict)) else False + for _, df in biology_dict.items() + ] # ---- Validation if not all(full_biology_data): # ---- Print, if verbose if verbose: - print( - f"No new processed biology data available for processing." 
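For reference, a hedged sketch of the NASC-to-number-density conversion used in `acoustic_pipeline` above; the stratum, NASC, and sigma_bs values are synthetic.

import numpy as np
import pandas as pd

nasc_biology = pd.DataFrame(
    {"stratum": [1, 2], "nasc": [1500.0, 800.0], "sigma_bs_mean": [2.0e-5, 1.5e-5]}
)
# Number density (animals nmi^-2) = NASC / (4 * pi * mean sigma_bs)
nasc_biology["number_density"] = nasc_biology["nasc"] / (
    4.0 * np.pi * nasc_biology["sigma_bs_mean"]
)
# First row: 1500 / (4 * pi * 2e-5) ~ 5.97e6 animals nmi^-2; the stratum mean
# weights merged in above are then used to derive the biomass density column.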
- ) + print("No new processed biology data available for processing.") else: # Get related biology data - acoustic_df = get_nasc_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) + acoustic_df = get_nasc_sql_data(acoustic_db, biology_dict, unique_columns=unique_columns) # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(biology_db, - biology_dict, - unique_columns=unique_columns) + sigma_bs_df = get_sigma_bs_sql_data(biology_db, biology_dict, unique_columns=unique_columns) # Calculate population estimates if valid data are available - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): + if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): # ---- Merge the NASC and sigma_bs datasets nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) + nasc_biology["number_density"] = nasc_biology["nasc"] / ( + 4.0 * np.pi * nasc_biology["sigma_bs_mean"] ) # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) - + weight_spatial_averages = get_average_strata_weights( + biology_db, biology_dict, unique_columns=unique_columns + ) + if weight_spatial_averages is not None: # Merge average weights with number density estimates nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) @@ -233,14 +235,19 @@ def biology_pipeline(biology_dict: dict, ) # Update the survey population estimate DataFrame with the newly computed densities - if not nasc_biology.empty: - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) - + if not nasc_biology.empty: + sql_group_update( + acoustic_db, + dataframe=nasc_biology, + table_name="survey_data_df", + columns=["number_density", "biomass_density"], + unique_columns=["stratum", "longitude", "latitude", "ping_time"], + ) + # Summarize strata summarize_strata(nasc_biology, strata_df, file_configuration) # Update population grid - update_population_grid(file_configuration, coordinates=["stratum"], - dataset=nasc_biology) + update_population_grid( + file_configuration, coordinates=["stratum"], dataset=nasc_biology + ) diff --git a/echopop/live/live_spatial_methods.py b/echopop/live/live_spatial_methods.py index 2d7ac606..75b362f2 100644 --- a/echopop/live/live_spatial_methods.py +++ b/echopop/live/live_spatial_methods.py @@ -1,33 +1,40 @@ +from pathlib import Path +from typing import List, Union + import geopandas as gpd -import pandas as pd import numpy as np -from geopy.distance import distance -from ..spatial.projection import utm_string_generator +import pandas as pd import shapely.geometry -from shapely.geometry import box import sqlalchemy as sqla -from pathlib import Path -from typing import Union, List -from .sql_methods import sql_group_update, query_dataset +from geopy.distance import distance +from shapely.geometry import box + +from ..spatial.projection import utm_string_generator +from .sql_methods import query_dataset, sql_group_update + def create_inpfc_strata(spatial_config: dict): # Extract the INPFC definitions - inpfc_definitions = 
spatial_config["inpfc"] + inpfc_definitions = spatial_config["inpfc"] # Create latitude bins latitude_bins = np.concatenate([[-90.0], inpfc_definitions["latitude_max"], [90.0]]) # ---- Append 1 more stratum layer - bin_names = np.concatenate([inpfc_definitions["stratum_names"], - [np.max(inpfc_definitions["stratum_names"]) + 1]]) - + bin_names = np.concatenate( + [inpfc_definitions["stratum_names"], [np.max(inpfc_definitions["stratum_names"]) + 1]] + ) + # Create spatial key - inpfc_strata_df = pd.DataFrame({ - "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), - "latitude_interval": pd.cut(np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), - latitude_bins), - "stratum": bin_names, - }) + inpfc_strata_df = pd.DataFrame( + { + "latitude_limit": np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), + "latitude_interval": pd.cut( + np.concatenate([inpfc_definitions["latitude_max"], [90.0]]), latitude_bins + ), + "stratum": bin_names, + } + ) # Add boundaries # ---- Lower @@ -38,8 +45,9 @@ def create_inpfc_strata(spatial_config: dict): # Return the dataframe return inpfc_strata_df + def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): - + # Create dataset copy dataset = dataset.copy() @@ -48,35 +56,37 @@ def apply_inpfc_definitions(dataset: pd.DataFrame, inpfc_df: pd.DataFrame): dataset.loc[:, "stratum"] = pd.cut( dataset.loc[:, "latitude"], np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), - labels = inpfc_df.loc[:, "stratum"] + labels=inpfc_df.loc[:, "stratum"], ).astype(int) - + return dataset else: - strata = pd.cut(dataset.copy(), - np.unique(np.hstack([inpfc_df.loc[:, "lower"], - inpfc_df.loc[:, "upper"]])), - labels = inpfc_df.loc[:, "stratum"] + strata = pd.cut( + dataset.copy(), + np.unique(np.hstack([inpfc_df.loc[:, "lower"], inpfc_df.loc[:, "upper"]])), + labels=inpfc_df.loc[:, "stratum"], ).astype(int) - + return strata # Return the INPFC-stratified dataset # return dataset + def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dict): # Get the acoustic-biology link method link_method = spatial_dict["link_method"] - + # Apply spatial definitions if isinstance(dataset, dict) and link_method == "INPFC": - dataset.update({ - k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items() - }) + dataset.update( + {k: apply_inpfc_definitions(d, spatial_dict["strata"]) for k, d in dataset.items()} + ) elif isinstance(dataset, pd.Series) and link_method == "INPFC": return apply_inpfc_definitions(dataset, spatial_dict["strata"]) + # def apply_inpfc_definitions(acoustic_data: dict, biology_data: dict, spatial_config: dict): # # Extract the INPFC definitions @@ -87,7 +97,7 @@ def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dic # # ---- Append 1 more stratum layer # bin_names = np.concatenate([inpfc_definitions["stratum_names"], # [np.max(inpfc_definitions["stratum_names"]) + 1]]) - + # # Create spatial key # spatial_config["spatial_key"] = pd.DataFrame({ # "latitude_limit": inpfc_definitions["latitude_max"], @@ -120,8 +130,9 @@ def apply_spatial_definitions(dataset: Union[dict, pd.Series], spatial_dict: dic # labels = bin_names, # ) + def define_boundary_box(boundary_dict: dict, projection: str): - + # Get x-coordinates if "longitude" in boundary_dict.keys(): x = np.array(boundary_dict["longitude"]) @@ -135,10 +146,12 @@ def define_boundary_box(boundary_dict: dict, projection: str): y = np.array(boundary_dict["eastings"]) # 
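For reference, a minimal sketch of the latitude-binned stratum assignment behind `create_inpfc_strata`/`apply_inpfc_definitions` above, using hypothetical latitude limits rather than real INPFC boundaries.

import numpy as np
import pandas as pd

latitude_max = [40.0, 45.0, 50.0]          # hypothetical northern stratum limits
stratum_names = [1, 2, 3]
# Pad the limits with global latitude bounds and append one extra northern stratum
bins = np.concatenate([[-90.0], latitude_max, [90.0]])
labels = stratum_names + [max(stratum_names) + 1]

latitudes = pd.Series([38.5, 46.2, 55.0])
strata = pd.cut(latitudes, bins, labels=labels).astype(int)
# -> strata of 1, 3, 4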
Create a boundary DataFrame - bound_df = pd.DataFrame({ - "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), - "y":np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), - }) + bound_df = pd.DataFrame( + { + "x": np.array([x.min(), x.max(), x.max(), x.min(), x.min()]), + "y": np.array([y.min(), y.max(), y.max(), y.min(), y.min()]), + } + ) # Convert to a GeoDataFrame and return the GeoDataFrame return gpd.GeoDataFrame( @@ -147,6 +160,7 @@ def define_boundary_box(boundary_dict: dict, projection: str): crs=projection, ) + def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Extract the griddification definitions @@ -161,8 +175,11 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Convert the coordinates, if needed if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): # ---- Compute the equivalent UTM string - utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), - np.median(boundary_box.loc[0:3, "y"]))) + utm_num = int( + utm_string_generator( + np.median(boundary_box.loc[0:3, "x"]), np.median(boundary_box.loc[0:3, "y"]) + ) + ) # ---- Compute the boundary box GeoDataFrame with the new projection boundary_box = boundary_box.to_crs(utm_num) # ---- Create a new projection for later @@ -184,8 +201,8 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # ---- Iterate through for y0 in np.arange(ymin, ymax, y_step): for x0 in np.arange(xmin, xmax, x_step): - x1 = x0-x_step - y1 = y0+y_step + x1 = x0 - x_step + y1 = y0 + y_step grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) # Convert to a GeoDataFrame @@ -210,23 +227,25 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # Bin the longitude data dataset_gdf["stratum_x"] = pd.cut( dataset_gdf["x"], - np.arange(xmin, xmax+x_step, x_step), - right = False, - labels = np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))), + np.arange(xmin, xmax + x_step, x_step), + right=False, + labels=np.arange(1, len(np.arange(xmin, xmax + x_step, x_step))), ).astype(int) # Bin the latitude data - dataset_gdf["stratum_y"] = pd.cut( - dataset_gdf["y"], - np.arange(ymin, ymax+y_step, y_step), - right = True, - labels = range(len(np.arange(ymin, ymax+y_step, y_step)) - 1), - ).astype(int) + 1 + dataset_gdf["stratum_y"] = ( + pd.cut( + dataset_gdf["y"], + np.arange(ymin, ymax + y_step, y_step), + right=True, + labels=range(len(np.arange(ymin, ymax + y_step, y_step)) - 1), + ).astype(int) + + 1 + ) # Update the original dataset - return ( - dataset_gdf.loc[:, ["stratum_x", "stratum_y"]] - .rename(columns={"stratum_x": "x", "stratum_y": "y"}) + return dataset_gdf.loc[:, ["stratum_x", "stratum_y"]].rename( + columns={"stratum_x": "x", "stratum_y": "y"} ) # dataset.loc[:, "x"] = dataset_gdf.copy().loc[:, "stratum_x"] # dataset.loc[:, "y"] = dataset_gdf.copy().loc[:, "stratum_y"] @@ -244,9 +263,9 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # boundary_box = define_boundary_box(griddify_definitions["bounds"], projection) # # Convert the coordinates, if needed -# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): +# if not set(["northings", "eastings"]).intersection(set(griddify_definitions["bounds"].keys())): # # ---- Compute the equivalent UTM string -# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), +# utm_num = int(utm_string_generator(np.median(boundary_box.loc[0:3, "x"]), 
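For reference, a small sketch of the box-grid construction pattern used by `apply_griddify_definitions`/`initialize_grid` above, over a hypothetical 2 x 2 nmi UTM extent at 1 nmi resolution.

import numpy as np
import geopandas as gpd
from shapely.geometry import box

xmin, ymin = 0.0, 0.0
xmax = ymax = 2 * 1852.0          # metres; 1 nmi = 1852 m
x_step = y_step = 1852.0          # 1 nmi grid resolution

grid_cells = [
    box(x0, y0, x0 + x_step, y0 + y_step)
    for y0 in np.arange(ymin, ymax, y_step)
    for x0 in np.arange(xmin, xmax, x_step)
]
cells_gdf = gpd.GeoDataFrame(geometry=grid_cells)
# cells_gdf.area / 1852 ** 2 -> 1.0 nmi^2 for each of the four cells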
# np.median(boundary_box.loc[0:3, "y"]))) # # ---- Compute the boundary box GeoDataFrame with the new projection # boundary_box = boundary_box.to_crs(utm_num) @@ -351,10 +370,11 @@ def apply_griddify_definitions(dataset: pd.DataFrame, spatial_config: dict): # # # biology_data["trawl_info_df"]["stratum"] = ( -# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) +# trawl_info_new["stratum_x"].astype(str) + "-" + trawl_info_new["stratum_y"].astype(str) # ) -def initialize_grid(file_configuration = dict): + +def initialize_grid(file_configuration=dict): # Get root directory, if defined if "data_root_dir" in file_configuration: @@ -382,7 +402,7 @@ def initialize_grid(file_configuration = dict): # Get projection projection = file_configuration["geospatial"]["projection"] - + # Get grid settings grid_settings = file_configuration["geospatial"]["griddify"] @@ -398,22 +418,26 @@ def initialize_grid(file_configuration = dict): # ---- y y = boundary["latitude"] # ---- Create DataFrame - boundary_df = pd.DataFrame({ - "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), - "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) - }) + boundary_df = pd.DataFrame( + { + "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), + "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]), + } + ) # Create GeoDataFrame boundary_gdf = gpd.GeoDataFrame( - data = boundary_df, + data=boundary_df, geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), - crs = projection + crs=projection, ) # Convert to UTM (decimal degrees to m) # ---- Create UTM code - utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, - (boundary_df.y.min() + boundary_df.y.max()) / 2) + utm_code = utm_string_generator( + (boundary_df.x.min() + boundary_df.x.max()) / 2, + (boundary_df.y.min() + boundary_df.y.max()) / 2, + ) # ---- Create number code utm_num = int(utm_code) # ---- UTM conversion @@ -432,7 +456,8 @@ def initialize_grid(file_configuration = dict): grid_cells = [] # ---- Initialize coordinate counter y_ct = 0 - x_coord = []; y_coord = [] + x_coord = [] + y_coord = [] # ---- Iterate through to generate cells for y0 in np.arange(ymin, ymax, y_step): y_ct += 1 @@ -449,9 +474,9 @@ def initialize_grid(file_configuration = dict): # Convert to a GeoDataFrame cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) - # ---- Add cordinates + # ---- Add coordinates cells_gdf.loc[:, "x"] = np.array(x_coord) - cells_gdf.loc[:, "y"] = np.array(y_coord) + cells_gdf.loc[:, "y"] = np.array(y_coord) # Get coastline shapefile directory, if defined if "coastline" in file_configuration["input_directories"]: @@ -459,14 +484,14 @@ def initialize_grid(file_configuration = dict): # Get coastline settings coast_settings = file_configuration["input_directories"]["coastline"] # ---- Get root folder directory - # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] - coast_root = ( - "/".join([root_dir, coast_settings["directory"], coast_settings["coastline_name"]]) + # coast_root = root_dir / coast_settings["directory"] / coast_settings["coastline_name"] + coast_root = "/".join( + [root_dir, coast_settings["directory"], coast_settings["coastline_name"]] ) # ---- Create filepath shp_filepath = ( - # root_dir / coast_settings["directory"] - # / coast_settings["coastline_name"] + # root_dir / coast_settings["directory"] + # / coast_settings["coastline_name"] # coast_root # 
/ f"{coast_settings['coastline_name']}.shp" "/".join([coast_root, f"{coast_settings['coastline_name']}.shp"]) @@ -479,63 +504,67 @@ def initialize_grid(file_configuration = dict): # Get original lat/lon geometry boundaries xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds - + # Read in file - full_coast = gpd.read_file(shp_filepath, - engine="pyogrio", - storage_options=file_configuration["storage_options"]) + full_coast = gpd.read_file( + shp_filepath, + engine="pyogrio", + storage_options=file_configuration["storage_options"], + ) # ---- Convert to UTM full_coast_utm = full_coast.to_crs(utm_code) # ---- Remove empty full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] - # Create bouning box with a buffer + # Create bounding box with a buffer boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) # ---- Create an unbuffered copy boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) # ---- Convert to a GeoDataFrame - boundary_box_unbuffered_gdf = ( - gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) + boundary_box_unbuffered_gdf = gpd.GeoDataFrame( + geometry=[boundary_box_unbuffered], crs=projection ) # ---- Clip the coastline for saving - clipped_coast_original = ( - gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) + clipped_coast_original = gpd.clip( + full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1) ) # Clip the coastline shapefile clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) # Clip the grid cells - cells_gdf.loc[:, "geometry"] = ( - cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) + cells_gdf.loc[:, "geometry"] = cells_gdf["geometry"].difference( + clipped_coast.geometry.union_all() ) # Calculate area per cell cells_gdf.loc[:, "area"] = cells_gdf.area # ---- Convert back to nmi^2 from m^2 - cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852 ** 2 + cells_gdf.loc[:, "area"] = cells_gdf.loc[:, "area"] / 1852**2 - # Convert back to original projection and clip - clipped_cells_latlon = ( - gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) - .reset_index(drop=True) - ) + # Convert back to original projection and clip + clipped_cells_latlon = gpd.clip( + cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf + ).reset_index(drop=True) # Initialize empty columns that can be added to later on - clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", - "abundance", "biomass"]] = 0.0 - + clipped_cells_latlon.loc[ + :, ["number_density_mean", "biomass_density_mean", "abundance", "biomass"] + ] = 0.0 + # Create output DataFrame - output_df = pd.DataFrame({ - "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) - }) + output_df = pd.DataFrame( + {"geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt)} + ) # ---- Add the required columns - output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], - axis=1) + output_df = pd.concat( + [output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], axis=1 + ) # ---- Initialize empty columns that can be added to later on - output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", - "biomass"]] = 0.0 - + output_df.loc[ + :, ["number_density_mean", "biomass_density_mean", "abundance", "biomass"] + ] = 0.0 + # Write to the database file (for the grid) # ---- Create engine engine = sqla.create_engine(f"sqlite:///{db_filepath}") @@ -544,33 +573,36 @@ def initialize_grid(file_configuration = dict): # Write to the 
database file (for the coastline shapefile) # ---- Create output copy - coastline_out = pd.DataFrame({ - "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) - }) + coastline_out = pd.DataFrame( + {"geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt)} + ) # ---- Concatenate - coastline_out = ( - pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) + coastline_out = pd.concat( + [coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1 ) # ---- Connect and create table _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace", index=False) -def update_population_grid(file_configuration: dict, - coordinates: Union[List[str], str], - dataset: Union[dict, pd.DataFrame]): + +def update_population_grid( + file_configuration: dict, coordinates: Union[List[str], str], dataset: Union[dict, pd.DataFrame] +): # Extract input directory settings file_settings = file_configuration["input_directories"] # Get filepath for grid grid_db = list( - Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings['grid']['database_name']}") + Path(file_configuration["database_directory"]).glob( + pattern=f"{file_settings['grid']['database_name']}" + ) )[0] # Get filepath for acoustics survey_db = list( - Path(file_configuration["database_directory"]) - .glob(pattern=f"{file_settings['acoustics']['database_name']}") + Path(file_configuration["database_directory"]).glob( + pattern=f"{file_settings['acoustics']['database_name']}" + ) )[0] # Define the SQL tables that will be parsed and queries @@ -578,29 +610,41 @@ def update_population_grid(file_configuration: dict, grid_table = "grid_df" # Get indexed survey data - indexed_data = query_dataset(survey_db, - dataset, - table_name=data_table, - data_columns=coordinates + ["x", "y", "number_density", - "biomass_density"], - unique_columns=coordinates) - + indexed_data = query_dataset( + survey_db, + dataset, + table_name=data_table, + data_columns=coordinates + ["x", "y", "number_density", "biomass_density"], + unique_columns=coordinates, + ) + # Get indexed grid data - indexed_grid = query_dataset(grid_db, - indexed_data, - table_name=grid_table, - data_columns= ["x", "y", "area", "number_density_mean", - "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) - + indexed_grid = query_dataset( + grid_db, + indexed_data, + table_name=grid_table, + data_columns=[ + "x", + "y", + "area", + "number_density_mean", + "biomass_density_mean", + "abundance", + "biomass", + ], + unique_columns=["x", "y"], + ) + # Set DataFrame index indexed_grid.set_index(["x", "y"], inplace=True) - # Update the areal density esitmates + # Update the areal density estimates # ---- Number (animals/nmi^2) indexed_grid["number_density_mean"] = indexed_data.groupby(["x", "y"])["number_density"].mean() # ---- Bioamss (kg/nmi^2) - indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])["biomass_density"].mean() + indexed_grid["biomass_density_mean"] = indexed_data.groupby(["x", "y"])[ + "biomass_density" + ].mean() # Compute the abundance and biomass per grid cell # ---- Abundance (# animals) @@ -612,12 +656,10 @@ def update_population_grid(file_configuration: dict, # ---- Reset index output_df = indexed_grid.reset_index() # ---- Grouped update - sql_group_update(grid_db, dataframe=output_df, table_name=grid_table, - columns=["number_density_mean", "biomass_density_mean", "abundance", - "biomass"], - unique_columns=["x", "y"]) - - - - - \ No 
newline at end of file + sql_group_update( + grid_db, + dataframe=output_df, + table_name=grid_table, + columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], + unique_columns=["x", "y"], + ) diff --git a/echopop/live/live_survey.py b/echopop/live/live_survey.py index 51cc4ba8..297d5be2 100644 --- a/echopop/live/live_survey.py +++ b/echopop/live/live_survey.py @@ -1,28 +1,12 @@ -from typing import Union, Optional, Literal -from pathlib import Path -from datetime import datetime import copy -import pandas as pd - -from .sql_methods import query_processed_files - -from .live_core import( - LIVE_DATA_STRUCTURE, -) +from datetime import datetime +from pathlib import Path +from typing import Literal, Optional, Union -from ..acoustics import ( - ts_length_regression, - to_dB, - to_linear -) +import pandas as pd -from .sql_methods import query_processed_files -from .live_acoustics import ( - compute_nasc, - format_acoustic_dataset, - preprocess_acoustic_data -) - +from . import live_data_loading as eldl, live_data_processing as eldp +from .live_acoustics import compute_nasc, format_acoustic_dataset, preprocess_acoustic_data from .live_biology import ( bin_length_data, compute_average_weights, @@ -30,25 +14,24 @@ length_bin_counts, length_bin_weights, length_weight_regression, - number_proportions, + number_proportions, preprocess_biology_data, - weight_proportions + weight_proportions, ) - +from .live_core import LIVE_DATA_STRUCTURE from .live_spatial_methods import initialize_grid +from .sql_methods import query_processed_files -from . import live_data_processing as eldp -from . import live_data_loading as eldl class LiveSurvey: """ - A real-time processing version of the `echopop` base `Survey` class that ingests biological, + A real-time processing version of the `echopop` base `Survey` class that ingests biological, acoustic, and event meta data to provide population estimates when generated. 
""" def __init__( self, - live_init_config_path: Union[str, Path], + live_init_config_path: Union[str, Path], live_file_config_path: Union[str, Path], cloud_storage_options: dict = {}, verbose: bool = True, @@ -66,10 +49,8 @@ def __init__( {"database": {key: None for key in self.config["input_directories"].keys()}} ) # ---- Add cloud storage options, if needed - self.config.update( - {"storage_options": cloud_storage_options} - ) - + self.config.update({"storage_options": cloud_storage_options}) + # Initialize input attribute self.input = copy.deepcopy(LIVE_DATA_STRUCTURE["input"]) @@ -88,8 +69,8 @@ def __init__( # Configure the spatial settings self.input.update({"spatial": eldl.configure_spatial_settings(self.config)}) - # TODO: Add verbosity for printing database filepaths/connections - if verbose: + # TODO: Add verbosity for printing database filepaths/connections + if verbose: pass def __repr__(self): @@ -100,7 +81,9 @@ def __repr__(self): acoustic_filenames = self.meta["provenance"]["acoustic_files_read"] # ---- Subset if many files are being processed if len(acoustic_filenames) > 2: - acoustic_filenames = acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + acoustic_filenames = ( + acoustic_filenames[:2] + ["..."] + [f"[n = {len(acoustic_filenames)}]"] + ) # ---- Format string acoustic_files = ", ".join(acoustic_filenames) else: @@ -119,8 +102,8 @@ def __repr__(self): biology_files = "None" # Get linked database names - linked_dbs = ( - "\n ".join([f"{key.title()}: {db}" for key, db in self.config["database"].items()]) + linked_dbs = "\n ".join( + [f"{key.title()}: {db}" for key, db in self.config["database"].items()] ) return ( @@ -130,96 +113,96 @@ def __repr__(self): f"Biology files being processed: \n {biology_files}\n" f"Linked databases: \n {linked_dbs}" ) - + def __str__(self): return self.__repr__() - def load_acoustic_data(self, - xarray_kwargs: dict = {}, - input_filenames: Optional[list] = None, - verbose: bool = True): - + def load_acoustic_data( + self, xarray_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True + ): + # Validate the data directory and format the filepaths - acoustic_files = eldl.validate_data_directory(self.config, dataset="acoustics", - input_filenames=input_filenames) - + acoustic_files = eldl.validate_data_directory( + self.config, dataset="acoustics", input_filenames=input_filenames + ) + # Read in the acoustic data files if acoustic_files: # ! 
[REQUIRES DASK] ---- Read in the listed file # ---- Read in the acoustic data files - prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files(acoustic_files, - xarray_kwargs=xarray_kwargs) + prc_nasc_df, acoustic_data_units = eldl.read_acoustic_files( + acoustic_files, xarray_kwargs=xarray_kwargs + ) # ---- Add the `acoustic_data_units` to the dictionary - self.config["acoustics"]["dataset_units"] = acoustic_data_units + self.config["acoustics"]["dataset_units"] = acoustic_data_units # ---- Preprocess the acoustic dataset # TODO: SettingWithCopyWarning: - self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data(prc_nasc_df.copy(), - self.input["spatial"], - self.config) + self.input["acoustics"]["prc_nasc_df"] = preprocess_acoustic_data( + prc_nasc_df.copy(), self.input["spatial"], self.config + ) # ---- Add meta key - self.meta["provenance"].update({ - "acoustic_files_read": acoustic_files, - }) - # TODO: Add verbosity for printing database filepaths/connections + self.meta["provenance"].update( + { + "acoustic_files_read": acoustic_files, + } + ) + # TODO: Add verbosity for printing database filepaths/connections if verbose: # ---- Create file list file_list = "\n".join(acoustic_files) - print( - f"The following acoustic files are being processed:\n" - f"{file_list}." - ) + print(f"The following acoustic files are being processed:\n" f"{file_list}.") else: self.input["acoustics"]["prc_nasc_df"] = None - def load_biology_data(self, - pandas_kwargs: dict = {}, - input_filenames: Optional[list] = None, - verbose: bool = True): + def load_biology_data( + self, pandas_kwargs: dict = {}, input_filenames: Optional[list] = None, verbose: bool = True + ): # Validate the data directory and format the filepaths - biology_files = eldl.validate_data_directory(self.config, dataset="biology", - input_filenames=input_filenames) - - # ! REMOVE + biology_files = eldl.validate_data_directory( + self.config, dataset="biology", input_filenames=input_filenames + ) + + # ! REMOVE self.meta["provenance"]["biology_files_checkpoint1"] = biology_files - - # TODO: Add verbosity for printing database filepaths/connections + + # TODO: Add verbosity for printing database filepaths/connections if biology_files and verbose: # ---- Create file list file_list = "\n".join(biology_files) - print( - f"The following biological files are being processed:\n" - f"{file_list}." - ) - + print(f"The following biological files are being processed:\n" f"{file_list}.") + # Read in the biology data files - initial_biology_output = eldl.read_biology_files(biology_files, self.config, - pandas_kwargs=pandas_kwargs) - - # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint2"] =( - {key: df.shape for key, df in initial_biology_output.items()} - ) + initial_biology_output = eldl.read_biology_files( + biology_files, self.config, pandas_kwargs=pandas_kwargs + ) + + # ! REMOVE + self.meta["provenance"]["biology_files_checkpoint2"] = { + key: df.shape for key, df in initial_biology_output.items() + } # Preprocess the biology dataset - self.input["biology"], self.input["biology_processed"] = ( - preprocess_biology_data(initial_biology_output, self.input["spatial"], self.config) + self.input["biology"], self.input["biology_processed"] = preprocess_biology_data( + initial_biology_output, self.input["spatial"], self.config ) - # ! REMOVE - self.meta["provenance"]["biology_files_checkpoint3"] = ( - {key: df.shape for key, df in self.input["biology_processed"].items()} - ) + # ! 
REMOVE + self.meta["provenance"]["biology_files_checkpoint3"] = { + key: df.shape for key, df in self.input["biology_processed"].items() + } # Add meta key - self.meta["provenance"].update({ - "biology_files_read": biology_files, - }) + self.meta["provenance"].update( + { + "biology_files_read": biology_files, + } + ) def process_biology_data(self): # TODO: How and when should the already processed data be imported? - # Separate out processed and unprocessed biological data + # Separate out processed and unprocessed biological data # ----- Unprocessed biology_unprocessed = self.input["biology"] @@ -227,77 +210,82 @@ def process_biology_data(self): root_directory = self.config["database_directory"] # Check if data are present - unprocess_data_dfs = ( - [True if isinstance(df, pd.DataFrame) and not df.empty else False - for _, df in biology_unprocessed.items()] - ) + unprocess_data_dfs = [ + True if isinstance(df, pd.DataFrame) and not df.empty else False + for _, df in biology_unprocessed.items() + ] # ---- Proceed in processing the unprocessed data if all(unprocess_data_dfs): # Compute `sigma_bs` by sending it to the appropriate database table - compute_sigma_bs(biology_unprocessed["specimen_df"], - biology_unprocessed["length_df"], - self.config) + compute_sigma_bs( + biology_unprocessed["specimen_df"], biology_unprocessed["length_df"], self.config + ) # Bin the length measurements of the biological data bin_length_data(biology_unprocessed, self.config["length_distribution"]) # Compute the length-weight regression and add it to the SQL table - length_weight_df = length_weight_regression(biology_unprocessed["specimen_df"], - self.config["length_distribution"], - self.config) - + length_weight_df = length_weight_regression( + biology_unprocessed["specimen_df"], self.config["length_distribution"], self.config + ) + # Compute length-binned counts for the aggregated and individual-based measurements - specimen_binned, specimen_binned_filtered, length_binned = ( - length_bin_counts(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - self.config) + specimen_binned, specimen_binned_filtered, length_binned = length_bin_counts( + biology_unprocessed["length_df"], biology_unprocessed["specimen_df"], self.config ) # Compute the number proportions specimen_number_proportion, length_number_proportion, sex_number_proportions = ( - number_proportions(specimen_binned, specimen_binned_filtered, - length_binned, self.config) + number_proportions( + specimen_binned, specimen_binned_filtered, length_binned, self.config + ) ) # Compute the length-binned weights for the aggregated and individual-based measurements - length_weight_binned, specimen_weight_binned = ( - length_bin_weights(biology_unprocessed["length_df"], - biology_unprocessed["specimen_df"], - length_weight_df,self.config) + length_weight_binned, specimen_weight_binned = length_bin_weights( + biology_unprocessed["length_df"], + biology_unprocessed["specimen_df"], + length_weight_df, + self.config, ) # Calculate the average weights among male, female, and all fish - self.input["weight_stratum_df"] = ( - compute_average_weights(specimen_number_proportion, - length_number_proportion, - sex_number_proportions, - length_weight_df, - self.config["length_distribution"], - self.config) + self.input["weight_stratum_df"] = compute_average_weights( + specimen_number_proportion, + length_number_proportion, + sex_number_proportions, + length_weight_df, + self.config["length_distribution"], + self.config, ) - + # Compute the weight 
proportions - self.input["biology"].update({ - "proportions": weight_proportions(biology_unprocessed["catch_df"], - specimen_weight_binned, - length_weight_binned, - length_number_proportion, - length_weight_df, - self.config) - }) + self.input["biology"].update( + { + "proportions": weight_proportions( + biology_unprocessed["catch_df"], + specimen_weight_binned, + length_weight_binned, + length_number_proportion, + length_weight_df, + self.config, + ) + } + ) # Update the database - query_processed_files(root_directory, - self.config["input_directories"]["biology"], - self.meta["provenance"]["biology_files_read"], - processed=True) - + query_processed_files( + root_directory, + self.config["input_directories"]["biology"], + self.meta["provenance"]["biology_files_read"], + processed=True, + ) + # Add meta key - self.meta["provenance"].update({ - "biology_files_processed": self.meta["provenance"]["biology_files_read"] - }) - + self.meta["provenance"].update( + {"biology_files_processed": self.meta["provenance"]["biology_files_read"]} + ) def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): @@ -311,46 +299,45 @@ def process_acoustic_data(self, echometrics: bool = True, verbose: bool = True): "No acoustic data located in `*.input['acoustics']['prc_nasc_df']" " DataFrame. Data processing step will therefore be skipped." ) - else: + else: # Get the unprocessed acoustic data acoustic_data_df = self.input["acoustics"]["prc_nasc_df"] # Integrate NASC (and compute the echometrics, if necessary) nasc_data_df = compute_nasc(acoustic_data_df, self.config, echometrics) - + # Format the dataframe and insert into the LiveSurvey object - self.input["acoustics"]["nasc_df"] = format_acoustic_dataset(nasc_data_df, - self.config, - self.meta) + self.input["acoustics"]["nasc_df"] = format_acoustic_dataset( + nasc_data_df, self.config, self.meta + ) # Add meta key - self.meta["provenance"].update({ - "acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"] - }) - - def estimate_population(self, - working_dataset: Literal["acoustic", "biology"], - verbose: bool = True): - + self.meta["provenance"].update( + {"acoustic_files_processed": self.meta["provenance"]["acoustic_files_read"]} + ) + + def estimate_population( + self, working_dataset: Literal["acoustic", "biology"], verbose: bool = True + ): + self.meta["provenance"][f"{working_dataset}_population"] = False # method if working_dataset == "acoustic": - eldp.acoustic_pipeline(self.input["acoustics"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose, - contrast_columns=["ship_id"]) + eldp.acoustic_pipeline( + self.input["acoustics"], + self.input["spatial"]["strata"], + self.config, + verbose=verbose, + contrast_columns=["ship_id"], + ) # --- Validate successful run self.meta["provenance"]["acoustic_population"] = True - + # method if working_dataset == "biology": - eldp.biology_pipeline(self.input["biology"], - self.input["spatial"]["strata"], - self.config, - verbose=verbose) + eldp.biology_pipeline( + self.input["biology"], self.input["spatial"]["strata"], self.config, verbose=verbose + ) # --- Validate successful run self.meta["provenance"]["biology_population"] = True - - diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index 4c59d975..a1d55a26 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -1,17 +1,21 @@ -from echopop.live.sql_methods import SQL -from shapely import wkt +from pathlib import Path +from typing import 
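For reference, a hedged end-to-end usage sketch of the `LiveSurvey` workflow defined above; the two YAML paths are placeholders, the import path is inferred from the file layout in this patch, and the call order simply follows the methods shown here.

from echopop.live.live_survey import LiveSurvey

survey = LiveSurvey(
    live_init_config_path="path/to/live_init_config.yml",     # placeholder path
    live_file_config_path="path/to/live_survey_config.yml",   # placeholder path
    verbose=True,
)

# Ingest any new files, process them, and update the population estimates
survey.load_acoustic_data()
survey.load_biology_data()
survey.process_biology_data()
survey.process_acoustic_data(echometrics=True)
survey.estimate_population(working_dataset="biology")
survey.estimate_population(working_dataset="acoustic")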
Optional, Union + +import geopandas as gpd import matplotlib.pyplot as plt -from matplotlib.colors import ListedColormap import numpy as np import pandas as pd -import geopandas as gpd -from typing import Union, Optional -from pathlib import Path -import matplotlib.gridspec as gridspec +from matplotlib.colors import ListedColormap +from shapely import wkt + +from echopop.live.sql_methods import SQL -def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], - projection: str, - coast_db: Optional[Union[Path, pd.DataFrame]] = None): + +def plot_livesurvey_grid( + grid_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None, +): # Extract grid data from database if needed if isinstance(grid_db, Path): @@ -23,31 +27,31 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], ) else: grid_data = grid_db - + # Extract coast data from database if needed if isinstance(coast_db, Path): # ---- SELECT - coast_data = SQL(coast_db, "select", table_name="coastline_df") + coast_data = SQL(coast_db, "select", table_name="coastline_df") elif coast_data is None: # ---- SELECT from `grid_data` - coast_data = SQL(grid_db, "select", table_name="coastline_df") + coast_data = SQL(grid_db, "select", table_name="coastline_df") elif not isinstance(coast_db, pd.DataFrame): raise TypeError( "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object, " "or exist within the SQL database as a table (`'coastline_df'`) within the `grid_data` " "input (i.e. `grid_data.db`)." - ) + ) else: - coast_data = coast_db - + coast_data = coast_db + # Format columns if needed (well-known-text to Polygon) # ---- `grid_data` if isinstance(grid_data["geometry"][0], str): grid_data["geometry"] = grid_data["geometry"].apply(wkt.loads) # ---- `coastline_data` if isinstance(coast_data["geometry"][0], str): - coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) - + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + # Generate GeoDataFrames # ---- `grid` grid_gdf = gpd.GeoDataFrame(grid_data, geometry="geometry", crs=projection) @@ -63,27 +67,21 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", "colormap": "viridis", - "color_threshold": { - "minimum": 1e1, - "maximum": 1e6 - }, - }, + "color_threshold": {"minimum": 1e1, "maximum": 1e6}, + }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", "colormap": "plasma", - "color_threshold": { - "minimum": 1e1, - "maximum": 1e6 - }, - }, + "color_threshold": {"minimum": 1e1, "maximum": 1e6}, + }, "biomass": { "name": "Biomass", "units": "kg", "colormap": "cividis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), - "maximum": 1e6 * grid_gdf["area"].max() + "maximum": 1e6 * grid_gdf["area"].max(), }, }, "abundance": { @@ -92,9 +90,9 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], "colormap": "inferno", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), - "maximum": 1e6 * grid_gdf["area"].max() + "maximum": 1e6 * grid_gdf["area"].max(), }, - } + }, } # Create a figure and a 2x2 grid of subplots @@ -108,7 +106,7 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap (np.linspace(0, 1, 256))[::-1] + newcolors = colormap(np.linspace(0, 1, 256))[::-1] # ---- Define `white` 
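        # NOTE: plain RGBA white; the reversed colormap's "start" color is swapped for this value
        # just below, presumably so cells at the low end of the color scale fade into the white
        # background instead of taking the colormap's end color.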
white = np.array([1, 1, 1, 1]) # ---- Replace "start" color @@ -124,29 +122,35 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], min_value = sub_grid_gdf[var].min() max_value = sub_grid_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=min_value, vmax=max_value) + norm = plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the polygons with color fills based on the variable (non-zero) - grid_gdf.plot(column=var, ax=ax, edgecolor="gainsboro", legend=False, cmap=custom_cmap, - norm=norm, - markersize=0, linewidth=0.5) + grid_gdf.plot( + column=var, + ax=ax, + edgecolor="gainsboro", + legend=False, + cmap=custom_cmap, + norm=norm, + markersize=0, + linewidth=0.5, + ) # ---- Add coastline data layer - coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + coast_gdf.plot(ax=ax, linewidth=1.2, color="gray", edgecolor="black") # ---- Set axis limits - ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*1.01) - ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + ax.set_xlim(axis_limits[0] * 1.005, axis_limits[2] * 1.01) + ax.set_ylim(axis_limits[1] * 0.98, axis_limits[3] * 1.005) # ---- Trim down the margins - ax.margins(0,0) + ax.margins(0, 0) # ---- Set adjustable aspect ratio # ax.set_aspect('equal', adjustable='box') # ---- Set the title and labels var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - ax.set_xlabel(u'Longitude (\u00B0E)') - ax.set_ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel("Longitude (\u00B0E)") + ax.set_ylabel("Latitude (\u00B0N)") # ---- Add colorbar - sm = plt.cm.ScalarMappable(cmap=custom_cmap, - norm=norm) + sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable cbar = fig.colorbar(sm, ax=ax, shrink=0.5) cbar.set_label(f"{var_info['units']}") @@ -161,17 +165,27 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], x_scale = (x1 - x0) * 0.1 y_scale = (y1 - y0) * 0.1 # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 - # ---- Plot scalebar - # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], # [scalebar_y, scalebar_y], color='black', lw=2) - ax.plot([x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], - [y0 + y_scale, y0 + y_scale], color='black', lw=2) + ax.plot( + [x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], + color="black", + lw=2, + ) # ---- Add scale text - ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, - f'{scalebar_length} km', ha='center', va='top', color='black') + ax.text( + x0 + x_scale + scalebar_length_in_degrees / 2, + y0 + y_scale - (y1 - y0) * 0.025, + f"{scalebar_length} km", + ha="center", + va="top", + color="black", + ) - # ax.text(scalebar_x + (scalebar_length / 200), - # scalebar_y - scalebar_y_offset, + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, # f'{scalebar_length} km', ha='center', va='bottom', color='black') # Adjust layout @@ -181,9 +195,12 @@ def plot_livesurvey_grid(grid_db: Union[Path, pd.DataFrame], # plt.show() return fig -def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], - projection: str, - coast_db: Optional[Union[Path, pd.DataFrame]] = None): + +def plot_livesurvey_track( + survey_data_db: Union[Path, pd.DataFrame], + projection: str, + coast_db: Optional[Union[Path, pd.DataFrame]] = None, +): # Extract grid data 
from database if needed if isinstance(survey_data_db, Path): @@ -195,29 +212,30 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], ) else: survey_data = survey_data_db - + # Extract coast data from database if needed if isinstance(coast_db, Path): # ---- SELECT - coast_data = SQL(coast_db, "select", table_name="coastline_df") + coast_data = SQL(coast_db, "select", table_name="coastline_df") elif not isinstance(coast_db, pd.DataFrame): raise TypeError( "Coast data input (`coast_data`) must either be a `Path` or `pandas.DataFrame` object." - ) + ) else: coast_data = coast_db - + # Format columns if needed (well-known-text to Polygon) # ---- `coastline_data` if isinstance(coast_data["geometry"][0], str): - coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) - + coast_data["geometry"] = coast_data["geometry"].apply(wkt.loads) + # Generate GeoDataFrames # ---- `grid` - survey_gdf = gpd.GeoDataFrame(survey_data, - geometry=gpd.points_from_xy(survey_data["longitude"], - survey_data["latitude"]), - crs=projection) + survey_gdf = gpd.GeoDataFrame( + survey_data, + geometry=gpd.points_from_xy(survey_data["longitude"], survey_data["latitude"]), + crs=projection, + ) # ---- `coast` coast_gdf = gpd.GeoDataFrame(coast_data, geometry="geometry", crs=projection) @@ -232,12 +250,12 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "inferno", "minimum": 0.0, "cbar_reverse": True, - "color_threshold": { + "color_threshold": { "minimum": 1e1, "maximum": 1e6, }, - "size": [25, 150] - }, + "size": [25, 150], + }, "biomass_density": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", @@ -248,19 +266,16 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "minimum": 1e1, "maximum": 1e6, }, - "size": [25, 150] - }, + "size": [25, 150], + }, "nasc": { "name": "Nautical area scattering coefficient", "units": "$\\mathregular{m^{2}~nmi^{-2}}$", "colormap": "viridis", "minimum": 0.0, "cbar_reverse": False, - "color_threshold": { - "minimum": 1e2, - "maximum": 1e4 - }, - "size": [25, 150] + "color_threshold": {"minimum": 1e2, "maximum": 1e4}, + "size": [25, 150], }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", @@ -268,11 +283,8 @@ def plot_livesurvey_track(survey_data_db: Union[Path, pd.DataFrame], "colormap": "viridis", "minimum": -999, "cbar_reverse": True, - "color_threshold": { - "minimum": -80.0, - "maximum": -36.0 - }, - "size": [5, 100] + "color_threshold": {"minimum": -80.0, "maximum": -36.0}, + "size": [5, 100], }, # "mean_Sv": { # "name": "$Mean \\mathregular{S_V}$", @@ -300,15 +312,12 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): sizes.loc[sizes < min_value] = min_value sizes.loc[sizes > max_value] = max_value - return ( - ((sizes - min_value) / (max_value - min_value)) - * (max_size - min_size) + min_size - ) - + return ((sizes - min_value) / (max_value - min_value)) * (max_size - min_size) + min_size + # Define colors for ship_ids (you can customize these colors as needed) ship_id_colors = { ship_id: plt.cm.tab10(i) # Use a colormap for distinct colors; adjust as needed - for i, ship_id in enumerate(survey_gdf['ship_id'].unique()) + for i, ship_id in enumerate(survey_gdf["ship_id"].unique()) } # Create a figure and a 2xn grid of subplots @@ -331,17 +340,24 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): custom_cmap = ListedColormap(newcolors) # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") 
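    # NOTE: the cruisetrack is drawn per vessel below: the survey points are grouped by
    # `ship_id` and each group is plotted as its own line using the color assigned in
    # `ship_id_colors`, with the line handles collected for the shared legend.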
- # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", + # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", # linewidth=0.25, linestyle="-") handles = [] # List to store legend handles for ship_id, group in survey_gdf.groupby("ship_id"): # Sort the group by latitude or longitude - # group = group.sort_values(by=["latitude", "longitude"]) - color = ship_id_colors.get(ship_id, 'gray') - line_handle, = ax.plot(group.geometry.x, group.geometry.y, color=color, - linewidth=0.25, linestyle="-", label=ship_id, zorder=1) + # group = group.sort_values(by=["latitude", "longitude"]) + color = ship_id_colors.get(ship_id, "gray") + (line_handle,) = ax.plot( + group.geometry.x, + group.geometry.y, + color=color, + linewidth=0.25, + linestyle="-", + label=ship_id, + zorder=1, + ) handles.append(line_handle) # Add handle to legend - # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, + # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, # linestyle="-", zorder=1) # ---- Drop "empty" values sub_gdf = survey_gdf[survey_gdf[var] > VARIABLE_MAP[var]["minimum"]] @@ -353,38 +369,40 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): min_value = sub_gdf[var].min() max_value = sub_gdf[var].max() # ---- Normalize colorscale - norm=plt.Normalize(vmin=min_value, vmax=max_value) + norm = plt.Normalize(vmin=min_value, vmax=max_value) # ---- Plot the points with color fills based on the variable (non-zero) ax.scatter( [geom.x for geom in sub_gdf.geometry], [geom.y for geom in sub_gdf.geometry], c=sub_gdf[var], - s=scale_sizes(values=sub_gdf[var], - min_value=min_value, - max_value=max_value, - min_size=VARIABLE_MAP[var]["size"][0], - max_size=VARIABLE_MAP[var]["size"][1]), + s=scale_sizes( + values=sub_gdf[var], + min_value=min_value, + max_value=max_value, + min_size=VARIABLE_MAP[var]["size"][0], + max_size=VARIABLE_MAP[var]["size"][1], + ), cmap=custom_cmap, norm=norm, - zorder = 2 + zorder=2, # edgecolor="black", # linewidths=0.1 - ) + ) # ---- Add coastline data layer - coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") + coast_gdf.plot(ax=ax, linewidth=1.2, color="gray", edgecolor="black") # ---- Set axis limits - ax.set_xlim(axis_limits[0]*1.005, axis_limits[2]*0.995) - ax.set_ylim(axis_limits[1]*0.98, axis_limits[3]*1.005) + ax.set_xlim(axis_limits[0] * 1.005, axis_limits[2] * 0.995) + ax.set_ylim(axis_limits[1] * 0.98, axis_limits[3] * 1.005) # ---- Trim down the margins - ax.margins(0,0) + ax.margins(0, 0) # ---- Set adjustable aspect ratio # ax.set_aspect('equal', adjustable='box') # ---- Set the title and labels var_info = VARIABLE_MAP[var] ax.set_title(f"{var_info['name']}") # ---- Set axis labels - ax.set_xlabel(u'Longitude (\u00B0E)') - ax.set_ylabel(u'Latitude (\u00B0N)') + ax.set_xlabel("Longitude (\u00B0E)") + ax.set_ylabel("Latitude (\u00B0N)") # ---- Add colorbar sm = plt.cm.ScalarMappable(cmap=custom_cmap, norm=norm) sm._A = [] # fake up the array of the scalar mappable @@ -401,18 +419,28 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): x_scale = (x1 - x0) * 0.1 y_scale = (y1 - y0) * 0.1 # scalebar_y_offset = (axis_limits[3]*1.005 - axis_limits[1]*0.98) * 0.05 - # ---- Plot scalebar - # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], + # ---- Plot scalebar + # ax.plot([scalebar_x, scalebar_x + scalebar_length / 100], # [scalebar_y, scalebar_y], color='black', lw=2) - ax.plot([x0 + x_scale, x0 + x_scale + 
scalebar_length_in_degrees], - [y0 + y_scale, y0 + y_scale], color='black', lw=2) + ax.plot( + [x0 + x_scale, x0 + x_scale + scalebar_length_in_degrees], + [y0 + y_scale, y0 + y_scale], + color="black", + lw=2, + ) # ---- Add scale text - ax.text(x0 + x_scale + scalebar_length_in_degrees / 2, y0 + y_scale - (y1 - y0) * 0.025, - f'{scalebar_length} km', ha='center', va='top', color='black') + ax.text( + x0 + x_scale + scalebar_length_in_degrees / 2, + y0 + y_scale - (y1 - y0) * 0.025, + f"{scalebar_length} km", + ha="center", + va="top", + color="black", + ) # ax.legend(handles=handles, title='Ship ID') - # ax.text(scalebar_x + (scalebar_length / 200), - # scalebar_y - scalebar_y_offset, + # ax.text(scalebar_x + (scalebar_length / 200), + # scalebar_y - scalebar_y_offset, # f'{scalebar_length} km', ha='center', va='bottom', color='black') # Adjust layout @@ -422,137 +450,179 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # plt.show() return fig -def plot_livesurvey_distributions(weight_table: pd.DataFrame, - stratum_table: pd.DataFrame, - specimen_table: pd.DataFrame, - length_table: pd.DataFrame, - biology_db: Optional[Path] = None): - + +def plot_livesurvey_distributions( + weight_table: pd.DataFrame, + stratum_table: pd.DataFrame, + specimen_table: pd.DataFrame, + length_table: pd.DataFrame, + biology_db: Optional[Path] = None, +): + # If calling from SQL database - if biology_db is not None: + if biology_db is not None: weight_table = SQL(biology_db, "select", table_name="length_weight_df") stratum_table = SQL(biology_db, "select", table_name="strata_summary_df") specimen_table = SQL(biology_db, "select", table_name="specimen_data_df") length_table = SQL(biology_db, "select", table_name="length_df") - elif not all([isinstance(df, pd.DataFrame) for df in [weight_table, stratum_table, - specimen_table, length_table]]): - raise TypeError( - "All tables must be a `pandas.DataFrame." 
- ) - + elif not all( + [ + isinstance(df, pd.DataFrame) + for df in [weight_table, stratum_table, specimen_table, length_table] + ] + ): + raise TypeError("All tables must be a `pandas.DataFrame.") + # Organize the weight table data # ---- Sum weights by stratum, sex, and length_bin aggregated_data = ( - weight_table.groupby(['stratum', 'sex', 'length_bin'])['weight'].sum().reset_index() + weight_table.groupby(["stratum", "sex", "length_bin"])["weight"].sum().reset_index() ) # ---- Create a column to indicate 'all' sexes aggregated_data_all = ( - aggregated_data.groupby(['stratum', 'length_bin'])['weight'].sum().reset_index() + aggregated_data.groupby(["stratum", "length_bin"])["weight"].sum().reset_index() ) - aggregated_data_all['sex'] = 'all' + aggregated_data_all["sex"] = "all" # ---- Combine the male, female, and all data plot_weight_data = pd.concat([aggregated_data, aggregated_data_all], ignore_index=True) - + # Define the sexes sexes = plot_weight_data.sex.unique().tolist() - + # Organize the length table data bins = plot_weight_data.length_bin.unique() + 1 full_bins = np.concatenate([[bins[0] - np.diff(bins).mean() / 2], bins]) - length_table["length_bin"] = ( - pd.cut(length_table["length"], bins=full_bins, labels=bins - 1).astype(float) - ) - length_table_sex = ( - length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() - ) + length_table["length_bin"] = pd.cut( + length_table["length"], bins=full_bins, labels=bins - 1 + ).astype(float) + # length_table_sex = ( + # length_table.groupby(["stratum", "sex", "length_bin"])["length_count"].sum().reset_index() + # ) length_table_all = ( length_table.groupby(["stratum", "length_bin"])["length_count"].sum().reset_index() ) - length_table_all['sex'] = 'all' + length_table_all["sex"] = "all" full_count = ( - specimen_table.meld(length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"]) + specimen_table.meld( + length_table_all, contrasts=["stratum", "sex", "species_id", "length_bin"] + ) .loc[lambda x: x.sex.isin(sexes)] - .groupby(['stratum', 'sex', 'length_bin'])['length_count'].sum().reset_index() + .groupby(["stratum", "sex", "length_bin"])["length_count"] + .sum() + .reset_index() ) full_count["total"] = full_count.groupby(["stratum", "sex"])["length_count"].transform("sum") full_count["number_proportion"] = full_count["length_count"] / full_count["total"] # ---- Combine into the full dataset for plotting plot_count_data = ( - plot_weight_data - .merge(full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), - on=["stratum", "sex", "length_bin"], how="left") + plot_weight_data.merge( + full_count.filter(["stratum", "sex", "length_bin", "number_proportion"]), + on=["stratum", "sex", "length_bin"], + how="left", + ) ).fillna(0.0) - + # Get a color map - colors = plt.colormaps['tab10'] - num_strata = len(stratum_table['stratum'].unique()) + colors = plt.colormaps["tab10"] + num_strata = len(stratum_table["stratum"].unique()) num_sexes = len(sexes) - color_map = colors(num_strata) - + # color_map = colors(num_strata) + # Plot fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(6, 8), sharex=True, sharey=True) plt.subplots_adjust(hspace=0.08, wspace=0.05, bottom=0.25) # Adjust spacing between plots - + # Plot weights and counts for i, sex in enumerate(sexes): # Weight plot (left column) ax_weight = axes[i, 0] - data_weight = plot_weight_data[plot_weight_data['sex'] == sex] - for j, (stratum, group) in enumerate(data_weight.groupby('stratum')): - # color = colors(i / 
num_strata) if num_strata > 1 else colors(0) + data_weight = plot_weight_data[plot_weight_data["sex"] == sex] + for j, (stratum, group) in enumerate(data_weight.groupby("stratum")): + # color = colors(i / num_strata) if num_strata > 1 else colors(0) color = colors(j / num_strata) if num_strata > 1 else colors(0) total = group["weight"].sum() group["proportions"] = group["weight"] / total if total > 0.0 else 0.0 ms = 5 if group["proportions"].max() > 0.0 else 0.1 - # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', + # handle, = ax_weight.plot(group['length_bin'], group['proportions'], marker='o', # label=f'Stratum {stratum}', color=color, ms=ms) - ax_weight.plot(group['length_bin'], group['proportions'], marker='o', - label=f'Stratum {stratum}', color=color, ms=ms) + ax_weight.plot( + group["length_bin"], + group["proportions"], + marker="o", + label=f"Stratum {stratum}", + color=color, + ms=ms, + ) if i == 0: - ax_weight.set_title(f'Weight') + ax_weight.set_title("Weight") if i < num_sexes - 1: # No x-ticks for non-bottom plots - ax_weight.set_xlabel('') + ax_weight.set_xlabel("") if i == num_sexes // 2: - ax_weight.set_ylabel('Within-stratum proportion [0, 1]') + ax_weight.set_ylabel("Within-stratum proportion [0, 1]") if i == num_sexes - 1: # Bottom plot - ax_weight.set_xlabel('Length bin (cm)') + ax_weight.set_xlabel("Length bin (cm)") ax_weight.set_ylim(0.0, 1.0) # Add label in the top-left corner - ax_weight.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), - transform=ax_weight.transAxes, - fontsize=12, verticalalignment='top', - bbox=dict(facecolor='white', alpha=0.8, - edgecolor='none')) - + ax_weight.text( + 0.05, + 1.00 - 0.05 * (num_sexes - 1), + sex.title(), + transform=ax_weight.transAxes, + fontsize=12, + verticalalignment="top", + bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"), + ) + # Count plot (right column) ax_count = axes[i, 1] - data_count = plot_count_data[plot_count_data['sex'] == sex] - for j, (stratum, group) in enumerate(data_count.groupby('stratum')): + data_count = plot_count_data[plot_count_data["sex"] == sex] + for j, (stratum, group) in enumerate(data_count.groupby("stratum")): color = colors(j / num_strata) if num_strata > 1 else colors(0) ms = 5 if group["number_proportion"].max() > 0.0 else 0.1 - ax_count.plot(group['length_bin'], group['number_proportion'], - marker='o', label=f'Stratum {stratum}', color=color, ms=ms) + ax_count.plot( + group["length_bin"], + group["number_proportion"], + marker="o", + label=f"Stratum {stratum}", + color=color, + ms=ms, + ) if i == 0: - ax_count.set_title(f"Number") + ax_count.set_title("Number") if i < num_sexes - 1: # No x-ticks for non-bottom plots - ax_count.set_xlabel('') + ax_count.set_xlabel("") if i == num_sexes - 1: # Bottom plot - ax_count.set_xlabel('Length bin (cm)') + ax_count.set_xlabel("Length bin (cm)") ax_count.set_ylim(0.0, 1.0) # Add label in the top-left corner - ax_count.text(0.05, 1.00 - 0.05 * (num_sexes - 1), sex.title(), - transform=ax_count.transAxes, - fontsize=12, verticalalignment='top', - bbox=dict(facecolor='white', alpha=0.8, - edgecolor='none')) + ax_count.text( + 0.05, + 1.00 - 0.05 * (num_sexes - 1), + sex.title(), + transform=ax_count.transAxes, + fontsize=12, + verticalalignment="top", + bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"), + ) # Create a new axes for the legend - legend_ax = fig.add_axes([0.15, 0.05, 0.7, 0.1]) # Position the legend axes (left, bottom, width, height) - legend_ax.axis('off') # Hide the new axes - + 
legend_ax = fig.add_axes( + [0.15, 0.05, 0.7, 0.1] + ) # Position the legend axes (left, bottom, width, height) + legend_ax.axis("off") # Hide the new axes + # Create a shared legend in the bottom-most subplot - handles, labels = axes[2, 1].get_legend_handles_labels() # Get handles and labels from the bottom-left plot - fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 0.2), - ncol=num_strata // 2 + 1, fontsize='small', title='INPFC stratum') + handles, labels = axes[ + 2, 1 + ].get_legend_handles_labels() # Get handles and labels from the bottom-left plot + fig.legend( + handles, + labels, + loc="upper center", + bbox_to_anchor=(0.5, 0.2), + ncol=num_strata // 2 + 1, + fontsize="small", + title="INPFC stratum", + ) # plt.show() return fig diff --git a/echopop/live/sql_methods.py b/echopop/live/sql_methods.py index 0e5f6a97..5a4765bb 100644 --- a/echopop/live/sql_methods.py +++ b/echopop/live/sql_methods.py @@ -1,13 +1,19 @@ -from sqlalchemy import create_engine, text, Engine, inspect -import sqlalchemy as sqla -import pandas as pd -from typing import Optional, Literal, Union, List -import numpy as np -from pathlib import Path import re +from pathlib import Path +from typing import List, Optional, Union -def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str, - primary_keys: Optional[list] = None): +import numpy as np +import pandas as pd +import sqlalchemy as sqla +from sqlalchemy import create_engine, inspect, text + + +def sql_create( + connection: sqla.Connection, + dataframe: pd.DataFrame, + table_name: str, + primary_keys: Optional[list] = None, +): """ Generate a SQL command to create a table with dynamic columns, primary keys, and indices. @@ -20,16 +26,15 @@ def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: str: The SQL command to create the table. """ # Generate column definitions - column_definitions = ( - ",\n".join(f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" - for col in dataframe.columns) - ) - + column_definitions = ",\n".join( + f"{col} {SQL_DTYPES[type(dataframe[col][0]).__name__]}" for col in dataframe.columns + ) + # Generate primary key definition primary_key_definition = "" if primary_keys: primary_key_definition = f",\nPRIMARY KEY ({', '.join(primary_keys)})" - + # Combine all parts into the final SQL command create_table_command = f""" CREATE TABLE IF NOT EXISTS {table_name} ( @@ -37,13 +42,13 @@ def sql_create(connection: sqla.Connection, dataframe: pd.DataFrame, table_name: {primary_key_definition} ); """ - + # Execute connection.execute(text(create_table_command.strip())) + def sql_map_tables(connection: sqla.Connection): - """ - """ + """ """ inspector = inspect(connection) table_names = inspector.get_table_names() # result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) @@ -52,7 +57,8 @@ def sql_map_tables(connection: sqla.Connection): # table_names = [name[0] for name in table_names] return table_names -def sql_validate(connection: sqla.Connection, table_name: str): + +def sql_validate(connection: sqla.Connection, table_name: str): """ Check if a table exists in the database. @@ -62,10 +68,11 @@ def sql_validate(connection: sqla.Connection, table_name: str): Returns: bool: True if the table exists, False otherwise. 
- """ + """ inspector = inspect(connection) return table_name in inspector.get_table_names() + def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] = None): """ Get a list of all tables present @@ -75,17 +82,17 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] Returns: list: True if the table exists, False otherwise. - """ + """ # Inspect the columns from the table if columns is None: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Retrieve column information - column_info = inspector.get_columns(table_name) + column_info = inspector.get_columns(table_name) # ---- Format as a dictionary and return the output - return {col['name']: {k: v for k, v in col.items() if k != 'name'} for col in column_info} - else: + return {col["name"]: {k: v for k, v in col.items() if k != "name"} for col in column_info} + else: # Inspect unique values in specified columns # ---- Create SQL command sql_command = f"SELECT DISTINCT {', '.join(columns)} FROM {table_name};" @@ -94,17 +101,23 @@ def sql_inspect(connection: sqla.Connection, table_name: str, columns: List[str] # ---- Extract unique values unique_values = table.fetchall() # ---- Format as a dictionary and return the output - return ( - {col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns)} - ) + return { + col: list(set(row[idx] for row in unique_values)) for idx, col in enumerate(columns) + } + def sql_drop(connection: sqla.Connection, table_name: str): - """ - """ + """ """ connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) - -def sql_insert(connection: sqla.Connection, table_name: str, columns: list, dataframe: pd.DataFrame, - id_columns: Optional[list] = None): + + +def sql_insert( + connection: sqla.Connection, + table_name: str, + columns: list, + dataframe: pd.DataFrame, + id_columns: Optional[list] = None, +): """ Insert data into a table. @@ -115,11 +128,11 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data data (list of dict): List of dictionaries containing data to insert or update. conflict_columns (list): List of column names to use for conflict resolution. 
""" - + # Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - table_columns = [col['name'] for col in inspector.get_columns(table_name)] + table_columns = [col["name"] for col in inspector.get_columns(table_name)] # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' @@ -139,7 +152,7 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data # Format `id_columns` if id_columns is not None and not isinstance(id_columns, list): id_columns = [id_columns] - + # Convert the DataFrame into a tuple and then into a string # ---- Replace NaN with None dataframe = dataframe.replace([np.nan], [None]) @@ -147,27 +160,27 @@ def sql_insert(connection: sqla.Connection, table_name: str, columns: list, data dataframe = dataframe[columns] # ---- DataFrame to Tuple data_tuple = [tuple(row) for row in dataframe.itertuples(index=False)] - + def format_value(x): if isinstance(x, str): return "'{}'".format(x.replace("'", "''")) elif isinstance(x, pd.Timestamp): return "'{}'".format(x) elif x is None: - return 'NULL' + return "NULL" else: return str(x) - + # ---- Tuple to String # data_str = ", ".join( # # f"({', '.join(map(lambda x: f'\'{x}\'' if isinstance(x, str) else str(x), row))})" - # f"({', '.join(map(lambda x: f'\'{x}\'' - # if isinstance(x, str) or isinstance(x, pd.Timestamp) + # f"({', '.join(map(lambda x: f'\'{x}\'' + # if isinstance(x, str) or isinstance(x, pd.Timestamp) # else 'NULL' if x is None else str(x), row))})" # for row in data_tuple # ) data_str = ", ".join(f"({','.join(map(lambda x: format_value(x), row))})" for row in data_tuple) - + # Construct the "ON CONFLICT, DO UPDATE SET" if needed on_conflict_clause = "" if id_columns: @@ -175,23 +188,29 @@ def format_value(x): ON CONFLICT ({', '.join(id_columns)}) DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} """ - + # Construct the SQL query sql_command = f""" INSERT INTO {table_name} ({column_names}) VALUES {data_str} {on_conflict_clause} - """ - + """ + # Execute connection.execute(text(sql_command.strip())) - + # Commit connection.commit() -def sql_update(connection: sqla.Connection, table_name: str, columns: list, - dataframe: Optional[pd.DataFrame] = None, operation: Optional[str] = None, - condition: Optional[str] = None): + +def sql_update( + connection: sqla.Connection, + table_name: str, + columns: list, + dataframe: Optional[pd.DataFrame] = None, + operation: Optional[str] = None, + condition: Optional[str] = None, +): """ Insert data into a table. @@ -202,14 +221,14 @@ def sql_update(connection: sqla.Connection, table_name: str, columns: list, data (list of dict): List of dictionaries containing data to insert or update. conflict_columns (list): List of column names to use for conflict resolution. 
""" - + # Prepare the SQL statement for insertion # ---- Check whether `columns` is '*' if "*" in columns: # ---- Create 'inspector' for the db file inspector = inspect(connection) # ---- Get the column names from the db file - columns = [col['name'] for col in inspector.get_columns(table_name)] + columns = [col["name"] for col in inspector.get_columns(table_name)] # ---- If not a List elif not isinstance(columns, list): columns = [columns] @@ -220,15 +239,16 @@ def format_value(x): elif isinstance(x, pd.Timestamp): return "'{}'".format(x) elif x is None: - return 'NULL' + return "NULL" else: return str(x) # Format the SET command # ---- Update column by applying arithmetic between table and dataframe if operation is not None and dataframe is not None: - set_list = [f"{column} = {column} {operation} {dataframe[column].values[0]}" - for column in columns] + set_list = [ + f"{column} = {column} {operation} {dataframe[column].values[0]}" for column in columns + ] # ---- Update column by applying arithmetic within table if dataframe is None and operation is not None: # ---- Make sure `operation` is a list @@ -240,7 +260,7 @@ def format_value(x): if dataframe is not None and operation is None: set_list = [f"{column} = {dataframe[column].values[0]}" for column in columns] # ---- Join the list - set_clause = ', '.join(set_list) + set_clause = ", ".join(set_list) # Add the WHERE clause if a parsed condition is provided if condition is not None: @@ -253,14 +273,18 @@ def format_value(x): # Execute connection.execute(text(sql_command.strip())) - + # Commit connection.commit() -def sql_select(connection: sqla.Connection, table_name: str, - columns: Optional[Union[list, str]] = None, - condition: Optional[str] = None, - output_type: type = pd.DataFrame): + +def sql_select( + connection: sqla.Connection, + table_name: str, + columns: Optional[Union[list, str]] = None, + condition: Optional[str] = None, + output_type: type = pd.DataFrame, +): # Columns if columns is None: @@ -286,16 +310,16 @@ def sql_select(connection: sqla.Connection, table_name: str, parsed_condition = parse_condition(condition) sql_command += " WHERE " + parsed_condition - # Execute the command + # Execute the command table = connection.execute(text(sql_command)) # Fetch the data from the table data = table.fetchall() - + # Inspect the table to construct a dictionary of expected datatypes for each column table_info = sql_inspect(connection, table_name=table_name) # ---- Whittle down the information dictionary to isolate just the column datatypes - table_dtypes = {col: info['type'] for col, info in table_info.items()} + table_dtypes = {col: info["type"] for col, info in table_info.items()} # Raise error if `output_type` is invalid if output_type not in [pd.DataFrame, np.ndarray, str, tuple]: @@ -304,23 +328,28 @@ def sql_select(connection: sqla.Connection, table_name: str, f"`pandas.DataFrame`, or `numpy.ndarray`." 
) - # Format the output + # Format the output # ---- DataFrame if output_type is pd.DataFrame: # ---- Create DataFrame output_df = pd.DataFrame(data, columns=table.keys()) # ---- Format the expected datatypes - df_dtypes = {col: SQL_DTYPES[type(dtype).__name__] - for col, dtype in table_dtypes.items() if col in columns } + df_dtypes = { + col: SQL_DTYPES[type(dtype).__name__] + for col, dtype in table_dtypes.items() + if col in columns + } # ---- Apply the dtypes return output_df.astype(df_dtypes) else: # ---- Get the datatypes that will correspond to each value of the tuples tuple_dtypes = [SQL_DTYPES[type(dtype).__name__] for _, dtype in table_dtypes.items()] - # ---- Convert the `Row` objects to tuples + # ---- Convert the `Row` objects to tuples converted_data = [ - tuple(dtype(value) if value is not None else None - for value, dtype in zip(row, tuple_dtypes)) + tuple( + dtype(value) if value is not None else None + for value, dtype in zip(row, tuple_dtypes) + ) for row in data ] # ---- String @@ -333,25 +362,25 @@ def sql_select(connection: sqla.Connection, table_name: str, else: return converted_data -def validate_tables(db_file: str, table_name: Union[str, List[str]], - reference_dataframe: pd.DataFrame): + +def validate_tables( + db_file: str, table_name: Union[str, List[str]], reference_dataframe: pd.DataFrame +): # Helper function def _validate_table(table): # ---- Check table existence if not SQL(db_file, "validate", table_name=table): - raise KeyError( - f"SQL database table `{table}` in `{db_file}` failed to initialize!" - ) + raise KeyError(f"SQL database table `{table}` in `{db_file}` failed to initialize!") # ---- Get DataFrame dtypes (avoid 'object' and similar ambiguous typing) - expected_dtypes = ( - {col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns} - ) - # ---- Inspect the table + expected_dtypes = { + col: type(reference_dataframe[col][0]).__name__ for col in reference_dataframe.columns + } + # ---- Inspect the table inspected_table = SQL(db_file, "inspect", table_name=table) # ---- Get the column dtypes (with back-formatting via configuration mapping) table_dtypes = { - col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ + col: SQL_DTYPES[type(inspected_table["filepath"]["type"]).__name__].__name__ for col in inspected_table.keys() } # ---- Compare keys @@ -363,10 +392,11 @@ def _validate_table(table): f"{', '.join(key_difference)}." ) # ---- Compare dtypes - dtypes_comparison = ( - {key: table_dtypes[key] for key in table_dtypes - if table_dtypes[key] != expected_dtypes.get(key)} - ) + dtypes_comparison = { + key: table_dtypes[key] + for key in table_dtypes + if table_dtypes[key] != expected_dtypes.get(key) + } # ---- Get key names dtypes_different_names = list(set(dtypes_comparison)) # ---- Raise error, if needed @@ -375,16 +405,17 @@ def _validate_table(table): f"The following columns from table `{table}` in `{db_file}` had unexpected " f"datatypes: {', '.join(dtypes_different_names)}." 
) - + # Iterate through tables to validate if isinstance(table_name, list): _ = [_validate_table(table) for table in table_name] else: _validate_table(table_name) + def initialize_database(root_directory: Path, file_settings: dict): - # Get the database name + # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database @@ -401,22 +432,27 @@ def initialize_database(root_directory: Path, file_settings: dict): # Create two tables for 'files read' and 'files processed' # ---- Read files - SQL(db_file, "create", table_name="files_read", dataframe=template_df, - primary_keys=["filepath"]) + SQL( + db_file, "create", table_name="files_read", dataframe=template_df, primary_keys=["filepath"] + ) # ---- Processed files - SQL(db_file, "create", table_name="files_processed", dataframe=template_df, - primary_keys=["filepath"]) - + SQL( + db_file, + "create", + table_name="files_processed", + dataframe=template_df, + primary_keys=["filepath"], + ) + # Query the database ensure it exists # ---- File existence if not Path(db_file).exists(): - raise FileExistsError( - f"SQL database file `{db_file}` failed to initialize!" - ) - + raise FileExistsError(f"SQL database file `{db_file}` failed to initialize!") + # Validate the created tables validate_tables(db_file, ["files_read", "files_processed"], template_df) + SQL_COMMANDS = { "create": dict(function=sql_create, args=["table_name", "dataframe", "primary_keys"]), "drop": dict(function=sql_drop, args=["table_name"]), @@ -424,37 +460,41 @@ def initialize_database(root_directory: Path, file_settings: dict): "inspect": dict(function=sql_inspect, args=["table_name", "columns"]), "map": dict(function=sql_map_tables, args=[]), "select": dict(function=sql_select, args=["table_name", "columns", "output_type", "condition"]), - "update": dict(function=sql_update, args=["table_name", "columns", "condition", "operation", - "dataframe"]), + "update": dict( + function=sql_update, args=["table_name", "columns", "condition", "operation", "dataframe"] + ), "validate": dict(function=sql_validate, args=["table_name"]), } - + SQL_DTYPES = { - 'int32': 'INTEGER', - 'int64': 'INTEGER', - 'float64': 'FLOAT', + "int32": "INTEGER", + "int64": "INTEGER", + "float64": "FLOAT", "float": "FLOAT", "int": "INTEGER", - 'bool': 'BOOLEAN', + "bool": "BOOLEAN", "Interval": "TEXT", "Timestamp": "DATETIME", - 'object': 'TEXT', + "object": "TEXT", "str": "TEXT", "FLOAT": float, "INTEGER": int, "DATETIME": str, "TEXT": str, "BIGINT": int, -} - -def sql_group_update(db_file: str, - dataframe: pd.DataFrame, - table_name: str, - columns: List[str], - unique_columns: List[str], - operation: Optional[str] = None, - id_columns: Optional[List[str]] = None): - +} + + +def sql_group_update( + db_file: str, + dataframe: pd.DataFrame, + table_name: str, + columns: List[str], + unique_columns: List[str], + operation: Optional[str] = None, + id_columns: Optional[List[str]] = None, +): + # Check for unique values contained within the table unique_values = SQL(db_file, "inspect", table_name=table_name, columns=unique_columns) @@ -462,8 +502,9 @@ def sql_group_update(db_file: str, table_values = {col: dataframe[col].unique().tolist() for col in unique_columns} # Find mismatched indices - new_indices = {col: list(set(table_values[col]) - set(unique_values[col])) - for col in unique_columns} + new_indices = { + col: list(set(table_values[col]) - set(unique_values[col])) for col in unique_columns + } # Filter the DataFrame to include only rows with these missing values # 
---- Create DataFrame copy @@ -477,18 +518,20 @@ def sql_group_update(db_file: str, filtered_df = pd.DataFrame(columns=filtered_df.columns) # Insert into the table if not otherwise present - if not filtered_df.empty: + if not filtered_df.empty: SQL(db_file, "insert", table_name=table_name, id_columns=id_columns, dataframe=filtered_df) - + case_statements = [] for col in columns: case_stmt = "CASE" for _, row in dataframe.iterrows(): # Construct the filter condition based on unique_columns - filter_conditions = ' AND '.join([ - f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" - for col in unique_columns - ]) + filter_conditions = " AND ".join( + [ + f"{col} = '{row[col]}'" if isinstance(row[col], str) else f"{col} = {row[col]}" + for col in unique_columns + ] + ) # Add the WHEN condition to the CASE statement case_stmt += f" WHEN {filter_conditions} THEN {row[col]}" case_stmt += f" ELSE {col} END" @@ -497,8 +540,7 @@ def sql_group_update(db_file: str, case_statements.append(f"{col} = {col} {operation} {case_stmt}") else: case_statements.append(f"{col} = {case_stmt}") - - + # Update the table # ---- Format the conditional string # case_statements = [] @@ -519,7 +561,7 @@ def sql_group_update(db_file: str, update_clause = ", ".join(case_statements) # Format the SQL COMMAND string - # sql_command = f""" + # sql_command = f""" # UPDATE {table_name} # SET {update_clause} # WHERE ({' OR '.join([ @@ -530,7 +572,7 @@ def sql_group_update(db_file: str, # for _, row in dataframe.iterrows() # ])}); # """ - sql_command = f""" + sql_command = f""" UPDATE {table_name} SET {update_clause}; """ @@ -554,6 +596,7 @@ def sql_group_update(db_file: str, # Dispose engine engine.dispose() + def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List[str]: # Get the data input column names @@ -567,17 +610,15 @@ def get_table_key_names(db_file: Path, data_dict: dict, table_name: str) -> List table_columns = data_dict[table_name].columns # Create a list of the primary keys - key_columns = ( - set(table_columns) - .intersection(["trawl_partition", "sex", "haul_num", "species_id", "longitude", - "latitude", "stratum"]) - ) + key_columns = set(table_columns).intersection( + ["trawl_partition", "sex", "haul_num", "species_id", "longitude", "latitude", "stratum"] + ) # Return a list of the output return list(key_columns) -def get_unique_identifiers(data_dict: dict, - unique_columns: List[str]) -> pd.DataFrame: + +def get_unique_identifiers(data_dict: dict, unique_columns: List[str]) -> pd.DataFrame: # Gather all dataframes from a dictionary into a list if isinstance(data_dict, dict): @@ -585,12 +626,16 @@ def get_unique_identifiers(data_dict: dict, else: df_list = [data_dict] - # Get unique values of each contrast column across the biological datasets + # Get unique values of each contrast column across the biological datasets combined_df = pd.concat( - [df[unique_columns] for df in df_list if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns)], - ignore_index=True + [ + df[unique_columns] + for df in df_list + if isinstance(df, pd.DataFrame) and all(col in df.columns for col in unique_columns) + ], + ignore_index=True, ).drop_duplicates() - + # Reduce into a single DataFrame return combined_df # if len(unique_columns) > 1: @@ -601,60 +646,80 @@ def get_unique_identifiers(data_dict: dict, def parse_condition(condition: str): # Replace logical operators with SQL equivalents - condition = condition.replace('&', ' AND ').replace('|', ' OR ') 
- + condition = condition.replace("&", " AND ").replace("|", " OR ") + # Handle "IN" lists and replace square brackets with parentheses - condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) - + condition = re.sub( + r"(\w+)\s*IN\s*\[(.*?)\]", + lambda m: f"{m.group(1)} IN ({m.group(2)})", + condition, + flags=re.IGNORECASE, + ) + # Handle range conditions for BETWEEN, including floats - condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', - lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) - + condition = re.sub( + r"(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)", + lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", + condition, + ) + # Handle individual comparisons - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) + condition = re.sub( + r"(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)", + lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", + condition, + ) + condition = re.sub( + r"(\w+)\s*([<>!=]+)\s*(\'[^\']*\')", + lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", + condition, + ) # Return the parsed condition return condition + def format_sql_select(table_name, column_names, condition_string): # Base SQL command to select columns from the table sql_command = f"SELECT {column_names} FROM {table_name}" - + # Parse the condition string parsed_condition = parse_condition(condition_string) - + # Add the WHERE clause if a parsed condition is provided if parsed_condition: sql_command += " WHERE " + parsed_condition - + # Add a semicolon at the end of the SQL command sql_command += ";" - + return sql_command + def format_sql_columns(kwargs: dict): # Columns if "columns" in kwargs and "condition" not in kwargs: if isinstance(kwargs["columns"], list) or isinstance(kwargs["columns"], pd.Index): kwargs["columns"] = ", ".join(kwargs["columns"]) - elif "columns" not in kwargs: + elif "columns" not in kwargs: kwargs["columns"] = "*" # ID/Conflict columns if "id_columns" in kwargs: if isinstance(kwargs["id_columns"], list) or isinstance(kwargs["id_columns"], pd.Index): - kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) + kwargs["id_columns"] = ", ".join(kwargs["id_columns"]) # Return the updated `kwargs` dictionary return kwargs + # TODO: Documentation -def query_processed_files(root_directory: Path, file_settings: dict, files: List[Path], - processed=False) -> dict: +def query_processed_files( + root_directory: Path, file_settings: dict, files: List[Path], processed=False +) -> dict: - # Get the database name + # Get the database name db_name = file_settings["database_name"] # Create filepath to the SQL database @@ -670,22 +735,33 @@ def query_processed_files(root_directory: Path, file_settings: dict, files: List files_str = [str(file) for file in files] # ---- Create DataFrame current_files = pd.DataFrame(files_str, columns=["filepath"]) - + # Check against `files_processed` previous_files = SQL(db_file, "select", table_name="files_processed", output_type=str) # Insert the files into the `files_read` table - if processed: - SQL(db_file, "insert", table_name="files_processed", dataframe=current_files, - id_columns=["filepath"]) + if processed: + SQL( + db_file, + "insert", + table_name="files_processed", + dataframe=current_files, + id_columns=["filepath"], + ) elif not 
current_files.empty: - SQL(db_file, "insert", table_name="files_read", dataframe=current_files, - id_columns=["filepath"]) + SQL( + db_file, + "insert", + table_name="files_read", + dataframe=current_files, + id_columns=["filepath"], + ) # ---- Apply filter by comparing sets and return the output return list(set(files_str) - set(previous_files)), db_file else: return None, db_file + # TODO: Documentation def sql_data_exchange(database_file: Path, **kwargs): @@ -700,18 +776,21 @@ def sql_data_exchange(database_file: Path, **kwargs): if not table_exists: # ---- Create table SQL(database_file, "create", **kwargs) - # ---- Insert into table + # ---- Insert into table SQL(database_file, "insert", **kwargs) # ---- Return the initial dataframe return kwargs.get("dataframe") - + # Insert into the table SQL(database_file, "insert", **kwargs) - + # Select existing data frame the database and return the output return SQL(database_file, "select", **kwargs) -def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): + +def reset_db_files( + file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None +): # Get all database files database_files = file_configuration["database"] @@ -727,22 +806,23 @@ def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str if None not in table_exception: table_names = list(set(table_names) - set(table_exception)) # ---- Iterate through - for table_name in table_names: + for table_name in table_names: SQL(db_file, "drop", table_name=table_name) - # ---- Validate that all tables were removed - remaining_tables = SQL(table_names, "map") + # ---- Validate that all tables were removed + remaining_tables = SQL(table_names, "map") if set(table_names).intersection(set(remaining_tables)): - raise ValueError( - f"Attempted reset of [{str(db_file)}] failed." 
- ) + raise ValueError(f"Attempted reset of [{str(db_file)}] failed.") + + +def query_dataset( + db_file: str, + data_dict: dict, + table_name: str, + data_columns: List[str], + unique_columns: List[str], + constraint: Optional[str] = None, +): -def query_dataset(db_file: str, - data_dict: dict, - table_name: str, - data_columns: List[str], - unique_columns: List[str], - constraint: Optional[str] = None): - # Validate that the desired table exists if SQL(db_file, "validate", table_name=table_name): # ---- Inspect the SQL table @@ -753,51 +833,58 @@ def query_dataset(db_file: str, valid_keys = list(set(inspected_table.keys()).intersection(set(data_columns))) # ---- Get unique identifiers unique_keys_df = get_unique_identifiers(data_dict, unique_keys) + # ---- Conditional string formatting helper function def format_value(x): if isinstance(x, str): return "'{}'".format(x.replace("'", "''")) return str(x) - # ---- Create conditional string + + # ---- Create conditional string conditional_str = " | ".join( - [" & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) - for _, row in unique_keys_df.iterrows()] + [ + " & ".join([f"{col} = {format_value(val)}" for col, val in row.items()]) + for _, row in unique_keys_df.iterrows() + ] ) # conditional_str = " | ".join( - # [" & ".join([f"{col} = {val}" for col, val in row.items()]) + # [" & ".join([f"{col} = {val}" for col, val in row.items()]) # for _, row in unique_keys_df.iterrows()] - # ) + # ) # conditional_str = ( - # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" - # for col in unique_keys_df.columns]) + # " & ".join([f"{col} in {np.unique(unique_keys_df[col]).tolist()}" + # for col in unique_keys_df.columns]) # ) # ---- Append the additional constraint statement if present if constraint is not None: conditional_str = f"({conditional_str})" + f" & {constraint}" # ---- SELECT the dataset using the conidtional statement - data_sql = SQL(db_file, "select", table_name=table_name, columns=valid_keys, - condition=conditional_str).filter(data_columns) + data_sql = SQL( + db_file, "select", table_name=table_name, columns=valid_keys, condition=conditional_str + ).filter(data_columns) else: data_sql = None # Return the table DataFrame return data_sql -def sql_update_strata_summary(source_db: str, - target_db: str, - source_table: str, - target_table: str, - data_columns: List[tuple[str, str]], - strata: list): - + + +def sql_update_strata_summary( + source_db: str, + target_db: str, + source_table: str, + target_table: str, + data_columns: List[tuple[str, str]], + strata: list, +): + # Format strata list as a string - strata_str = ', '.join(map(str, strata)) + strata_str = ", ".join(map(str, strata)) # Function reference map FUNCTION_MAP = { - "sum": {"function": "SUM", - "suffix": "sum"}, - "mean": {"function": "AVG", - "suffix": "mean"} + "sum": {"function": "SUM", "suffix": "sum"}, + "mean": {"function": "AVG", "suffix": "mean"}, } # Prepare the SQL script @@ -830,7 +917,7 @@ def sql_update_strata_summary(source_db: str, ) WHERE stratum IN ({strata_str}); """ - # ----- Append DETACH commands only once at the end + # ----- Append DETACH commands only once at the end sql_script += """ -- Detach the databases DETACH DATABASE source; @@ -840,7 +927,7 @@ def sql_update_strata_summary(source_db: str, # Create the engine engine = create_engine(f"sqlite:///{target_db}") - # Create the SQL database connection and send the script + # Create the SQL database connection and send the script with engine.connect() as connection: 
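        # NOTE: the ATTACH/UPDATE/DETACH script assembled above contains multiple SQL statements,
        # so it is run through the raw DBAPI (sqlite3) connection's executescript() below; the
        # sqlite3 driver accepts only one statement per execute() call, so a single SQLAlchemy
        # text() execute would not run the full script.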
dbapi_conn = connection.connection _ = dbapi_conn.executescript(sql_script) @@ -851,11 +938,11 @@ def SQL(db_file: str, command: str, **kwargs): # Create engine from `db_file` string engine = create_engine(f"sqlite:///{db_file}") - + # Format the data columns, if necessary, to fit within the SQL commands if command not in ["inspect", "update"]: kwargs = format_sql_columns(kwargs) - + # Run the command try: with engine.connect() as connection: @@ -867,6 +954,6 @@ def SQL(db_file: str, command: str, **kwargs): kwargs = {key: value for key, value in kwargs.items() if key in command_args} # ---- Return output return command_function(connection, **kwargs) - finally: + finally: # ---- Dispose of the engine to release any resources being pooled/used engine.dispose() diff --git a/echopop/mesh_generation.py b/echopop/mesh_generation.py index 7752fe63..257829db 100644 --- a/echopop/mesh_generation.py +++ b/echopop/mesh_generation.py @@ -1,2267 +1,2307 @@ -import numpy as np -import pandas as pd -from sqlalchemy import create_engine, text -from pathlib import Path -import os - -SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) - -# Coordinates -x = np.array([1, 2, 3, 4, 5]) -y = np.array([1, 2, 3, 4, 5]) - -# Create the grid points -grid_points = [(i, j, 0) for i in x for j in y] - -def initialize_grid(): - - -data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") -db_directory = data_root_dir / "database" -# ---- Create the directory if it does not already exist -db_directory.mkdir(parents=True, exist_ok=True) -# ---- Complete path to `biology.db` -db_file = db_directory / "grid.db" - -from sqlalchemy import create_engine, MetaData, Table, select, inspect, update, text, case - -# Initialize the database and create the table -engine = create_engine(f"sqlite:///{db_file}") - -# Define metadata and the table to drop -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Drop the table -with engine.connect() as connection: - grid_table.drop(connection) - print("Table 'grid' has been dropped.") - -# Inspect the database -inspector = inspect(engine) -tables = inspector.get_table_names() -print(tables) - -def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): - """ - Generate a SQL command to create a table with dynamic columns, primary keys, and indices. - - Args: - table_name (str): The name of the table. - columns (dict): A dictionary where keys are column names and values are data types. - primary_keys (list, optional): List of column names to be used as primary keys. - index_columns (list, optional): List of column names to be indexed. - - Returns: - str: The SQL command to create the table. 
- """ - # Generate column definitions - column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) - - # Generate primary key definition - primary_key_definition = "" - if primary_keys: - primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" - - # Generate index definitions - index_definitions = "" - if index_columns: - index_definitions = "\n".join( - f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" - for col in index_columns - ) - - # Combine all parts into the final SQL command - create_table_command = f""" - CREATE TABLE IF NOT EXISTS {table_name} ( - {column_definitions} - {primary_key_definition} - ); - """ - # Return the command and any index definitions - return create_table_command.strip() + "\n" + index_definitions - -# Define metadata and the table to drop -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Drop the table -with engine.connect() as connection: - grid_table.drop(connection) - print("Table 'grid' has been dropped.") - -check_table_exists(engine, "grid") - -with engine.connect() as connection: - sql_create(connection, df, table_name, primary_keys) - -# Create the table -table_name = "grid" -columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} -primary_keys = ["x", "y"] -index_columns = ["x", "y"] - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -with engine.connect() as connection: - connection.execute(text(create_sql)) - -inspector = inspect(engine) -tables = inspector.get_table_names() -print(tables) - -check_table_exists(engine, "grid") - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -converted_data[0] -check_table_exists(engine, "files_read") - -zarr_files_str = ["A", "B", "C", "D"] -# ---- Create DataFrame -current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) - -with engine.connect() as connection: - sql_create(connection, table_name="files_read", df=current_files) - sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) - -table_name = "files_read" -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - - - -from sqlalchemy.exc import IntegrityError - -def insert_or_update(engine, table_name, columns, data, conflict_columns): - """ - Insert or update data in a table. - - Args: - engine (Engine): The SQLAlchemy engine instance. - table_name (str): The name of the table. - columns (list): List of column names. - data (list of dict): List of dictionaries containing data to insert or update. - conflict_columns (list): List of column names to use for conflict resolution. 
- """ - - # Prepare the SQL statement for insertion - column_names = ", ".join(columns) - placeholder = ", ".join(f":{col}" for col in columns) - # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) - values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data - ) - - - # Construct the SQL query - sql = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_str} - ON CONFLICT ({', '.join(conflict_columns)}) - DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} - """ - - # Flatten the list of data for execution - # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item in data] for item in sublist] - - # Execute the SQL command - with engine.connect() as connection: - try: - connection.execute(text(sql)) - # connection.commit() - print(f"Data inserted or updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Prepare data for insertion or update -# data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] -data = grid_points - -# Insert or update data -insert_or_update(engine, table_name, columns.keys(), data, primary_keys) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -def update_specific_rows(engine, table_name, updates, conditions): - """ - Update specific rows in a table based on conditions. - - Args: - engine (Engine): The SQLAlchemy engine instance. - table_name (str): The name of the table. - updates (dict): Dictionary of columns and their new values to be updated. - conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
- """ - - # Construct the SET clause for the update - set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) - - # Construct the WHERE clause for the update - where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) - - # Construct the SQL query - sql = f""" - UPDATE {table_name} - SET {set_clause} - WHERE {where_clause} - """ - - # Prepare parameters for the query - parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} - - # Execute the SQL command - with engine.connect() as connection: - try: - connection.execute(text(sql), parameters) - print(f"Rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Define table name -table_name = "grid" -# Define the table and columns -table_name = 'grid' -condition_columns = ['x', 'y'] - -# Define the updates and conditions -dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": np.array([1, 2, 3 , 4, 5]).astype(float)} -new_data = pd.DataFrame(dd) -new_data -df = new_data - -kwargs = {"table_name": "grid", "columns": df.columns, "df": df} - -with engine.connect() as connection: - # sql_create(connection, table_name = "grid", df = df) - # sql_validate(connection, "grid") - # sql_drop(connection, "grid") - sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, id_columns=["x", "y"]) - - -data_tuples = [tuple(row) for row in df.itertuples(index=False)] - -all_columns = df.columns.tolist() -if len(condition_columns) >= len(all_columns): - raise ValueError("The number of condition columns must be less than the number of columns in data.") - -# Prepare column names and conditions -update_columns = [col for col in all_columns if col not in condition_columns] -condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) -update_str = ", ".join(f"{col} = ?" 
for col in update_columns) -data_tuples = [tuple(row) for row in df.itertuples(index=False)] -# Generate values string for SQL command -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuples -) - -# Construct the SQL query -sql = f""" -INSERT INTO {table_name} ({', '.join(all_columns)}) -VALUES {values_str} -ON CONFLICT ({', '.join(condition_columns)}) -DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col in update_columns)} -""" - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql)) - connection.commit() - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - - -# Insert or update data -insert_or_update(engine, table_name, columns.keys(), data, primary_keys) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) - -# Ensure that condition_columns match the length of data tuples minus the update column -if len(condition_columns) != len(df.columns) - 1: - raise ValueError("The number of condition columns must match the number of columns in data minus the update column.") - -# Prepare the SQL statement for update -update_columns = [col for col in df.columns if col not in condition_columns] -condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) -update_str = ", ".join(f"{col} = ?" 
for col in update_columns) -# Convert DataFrame rows to list of tuples -data_tuples = [tuple(row) for row in df.itertuples(index=False)] - -# Generate a values string for the SQL command -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data_tuples -) -# Construct the SQL query -sql = f""" -UPDATE {table_name} -SET {update_str} -WHERE {condition_str} -""" - -# Flatten the list of data for execution -flattened_data = [] -for row in data_tuples: - conditions = row[:len(condition_columns)] - update_values = row[len(condition_columns):] - flattened_data.extend(conditions + update_values) - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql), flattened_data) - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") - -# Execute the SQL command -with engine.connect() as connection: - try: - connection.execute(text(sql), flattened_data) - print(f"Specific rows updated successfully in table '{table_name}'.") - except IntegrityError as e: - print(f"IntegrityError: {e}") - except Exception as e: - print(f"An error occurred: {e}") -# Update specific rows -update_specific_rows(engine, table_name, updates, conditions) - -# Verify the update -sql_command = f"SELECT * FROM {table_name};" -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -for row in rows: - print(row) -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - connection.commit() - -check_table_exists(engine, "grid") - -# Define table name, columns, and data -table_name = 'grid' -columns = ['x', 'y', 'value'] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0) -] - -# Prepare the columns part of the SQL statement -columns_str = ", ".join(columns) - -# Prepare the values part of the SQL statement -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data -) - - - - - - -print("Generated SQL Command:") -print(sql_command) - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - -def insert_values_sql(table_name, columns, values, filter_clause=""): - """ - Generate a SQL command to insert values into a table. - - Args: - table_name (str): The name of the table. - columns (list): List of column names to be inserted. - values (list of tuples): List of tuples where each tuple represents a row of values to be inserted. - filter_clause (str, optional): Optional filter clause to specify conditions for insertion. - - Returns: - str: The SQL command to insert values into the table. - """ - # Generate column names - column_names = ", ".join(columns) - - # Generate value placeholders - value_placeholders = ", ".join("?" * len(columns)) - - # Generate values part - values_part = ", ".join(f"({', '.join('?' 
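# A working sketch of the batched UPDATE above: SQLAlchemy's text() construct expects named
# (:param) binds rather than "?" markers, and accepts a list of dicts for executemany.
# Table and column names below are illustrative.
from sqlalchemy import create_engine, text

example_engine = create_engine("sqlite:///:memory:")
with example_engine.begin() as connection:
    connection.execute(text("CREATE TABLE grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y))"))
    connection.execute(text("INSERT INTO grid (x, y, value) VALUES (1, 1, 0.0), (2, 2, 0.0)"))
    update_stmt = text("UPDATE grid SET value = :value WHERE x = :x AND y = :y")
    connection.execute(update_stmt, [{"x": 1, "y": 1, "value": 10.0}, {"x": 2, "y": 2, "value": 20.0}])
    print(connection.execute(text("SELECT * FROM grid ORDER BY x")).fetchall())
    # [(1, 1, 10.0), (2, 2, 20.0)]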
* len(columns))})" for _ in values) - - # Flatten the values list for insertion - flattened_values = [item for sublist in values for item in sublist] - - # Create the SQL command - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - {filter_clause} - """ - return insert_command.strip(), flattened_values - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0)] - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Data:\n", insert_data) - -insrt_stmt = - -with engine.connect() as connection: - connection.execute(text(insert_sql), tuple(insert_data)) - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] - -# Call the function -insert_or_update_table(engine, table_name, columns, data, conflict_columns) - -# Example usage -table_name = "grid" -columns = ["x", "y", "value"] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0), -] - -sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" -test_data = [{'x': 1, 'y': 1, 'value': 1.0}] - -with engine.connect() as connection: - connection.execute(text(sql_command), test_data) - -# Generate the SQL command and data -insert_stmt = insert_into_table(table_name, columns, data) - -# Print the generated SQL command (for validation) -print("Insert SQL Command:") -print(insert_stmt) - -# Print for validation -print("Insert SQL Command:") -print(insert_sql) -print("Data:") -print(insert_data) - -# Example execution with SQLAlchemy -with engine.connect() as connection: - connection.execute(insert_stmt) - -def insert_values_sql(table_name, columns, values): - """ - Generate SQL command for inserting values into a table. - - Args: - table_name (str): The name of the table. - columns (list): List of column names. - values (list of tuples): List of values to insert. - - Returns: - str: The SQL command to insert the values. - list: Flattened list of values for binding to the SQL command. - """ - column_names = ", ".join(columns) - value_placeholders = ", ".join("?" * len(columns)) - values_part = ", ".join(f"({value_placeholders})" for _ in values) - flattened_values = [item for sublist in values for item in sublist] - - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - """ - return insert_command.strip(), flattened_values - -def check_table_exists(engine, table_name): - """ - Check if a table exists in the database. - - Args: - engine: SQLAlchemy engine object. - table_name (str): The name of the table to check. - - Returns: - bool: True if the table exists, False otherwise. - """ - inspector = inspect(engine) - return table_name in inspector.get_table_names() - -with engine.connect() as connection: - # sql_validate(connection, "grid") - sql_inspect(connection) - sql_drop(connection, table_name) - -def select_from_table(engine, table_name, columns='*'): - """ - Select data from a table. - - Args: - engine: SQLAlchemy engine object. - table_name (str): The name of the table to select from. - columns (str or list): Columns to select. '*' selects all columns. - - Returns: - list: List of rows returned by the query. 
- """ - metadata = MetaData(bind=engine) - table = Table(table_name, metadata, autoload_with=engine) - - if columns == '*': - columns = [col.name for col in table.columns] - elif isinstance(columns, str): - columns = [columns] - - stmt = select([table.c[col] for col in columns]) - - with engine.connect() as connection: - result = connection.execute(stmt) - return result.fetchall() - -# Create table -table_name = "grid" -columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} -primary_keys = ["x", "y"] -index_columns = ["value"] - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -with engine.connect() as connection: - connection.execute(create_sql) - -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] - -# Insert data function -def insert_values_sql(table_name, columns, values): - column_names = ", ".join(columns) - value_placeholders = ", ".join("?" * len(columns)) - values_part = ", ".join(f"({value_placeholders})" for _ in values) - - insert_command = f""" - INSERT INTO {table_name} ({column_names}) - VALUES {values_part} - """ - # Flatten the list of values into a single list - flattened_values = [value for sublist in values for value in sublist] - - return insert_command.strip(), flattened_values - - -table_name = 'grid' -columns = ['x', 'y', 'value'] -data = [ - (1, 1, 1.0), - (2, 2, 1.5), - (3, 3, 2.0) -] - -# Prepare the columns part of the SQL statement -columns_str = ", ".join(columns) - -# Prepare the values part of the SQL statement -values_str = ", ".join( - f"({', '.join(map(str, row))})" - for row in data -) - -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - -# Execute the SQL command -with engine.connect() as connection: - connection.execute(text(sql_command)) - -sql_command = f"SELECT * FROM {table_name};" - -with engine.connect() as connection: - result = connection.execute(text(sql_command)) - rows = result.fetchall() - -print(f"Data in table {table_name}:") -for row in rows: - print(row) -# Construct the full SQL command -sql_command = f""" -INSERT INTO {table_name} ({columns_str}) -VALUES {values_str}; -""" - - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Insert Data:\n", insert_data) - -with engine.connect() as connection: - connection.execute(insert_sql, [insert_data]) - -# Check table existence -exists = check_table_exists(engine, table_name) -print(f"Table '{table_name}' exists: {exists}") - -# Select data from table -data = select_from_table(engine, table_name, insert_columns) -print(f"Data from '{table_name}':") -for row in data: - print(row) - - - - -create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) -print("Create Table SQL:\n", create_sql) - -# Define the values for insertion -insert_columns = ["x", "y", "value"] -insert_values = [(1, 1, 10.0)] - -insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) -print("Insert Values SQL:\n", insert_sql) -print("Data:\n", insert_data) - -# Example usage -table_name = "grid" -columns = { - "x": "INTEGER", - "y": "INTEGER", - "value": "REAL" -} -primary_keys = ["x", "y"] -index_columns = ["value"] - -sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) -print(sql_command) - -# Create the table -create_table_sql = """ -CREATE TABLE IF NOT EXISTS 
grid ( - x INTEGER, - y INTEGER, - value REAL, - PRIMARY KEY (x, y) -); -""" - -# Insert grid points -insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) -insert_sql = f""" -INSERT INTO grid (x, y, value) VALUES {insert_values}; -""" - -# Connect to the database and execute the commands -with engine.connect() as connection: - try: - # Create table if it does not exist - connection.execute(text(create_table_sql)) - # Insert grid points - connection.execute(text(insert_sql)) - connection.commit() - print("Grid points successfully inserted.") - except Exception as e: - print(f"An error occurred: {e}") - - -engine = create_engine(f"sqlite:///{db_file}") -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Read existing grid values from the database into a DataFrame -with engine.connect() as connection: - select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) - result = connection.execute(select_stmt) - existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) - -# Coordinates to update -update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} - -# Create a dictionary for fast lookup -update_dict = {(i, j): 1.0 for i, j in update_coords} - -# Update the grid_points with new values where applicable -updated_grid_points = [ - (i, j, update_dict.get((i, j), value)) - for i, j, value in grid_points -] - -# Convert the list of tuples to a DataFrame -df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) - -# Print the DataFrame -print(df_updated_grid_points) - -# Merge existing and updated data to find differences -merged_data = pd.merge(existing_data, df_updated_grid_points, on=['x', 'y'], suffixes=('_existing', '_updated')) -differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] - -# Assuming 'differences' is your DataFrame with updated values -# Create a dictionary for batch updating -update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() - -# Generate the SQLAlchemy update statement -update_stmt = update(grid_table).where( - grid_table.c.x.in_(update_dict.keys()) -).values({ - grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) -}) - -# Create the CASE statement -case_stmt = case( - { - (grid_table.c.x == x) & (grid_table.c.y == y): value - for (x, y), value in update_dict.items() - }, - else_=grid_table.c.value -) - -# Convert the DataFrame into a dictionary of case statements -case_stmt = case( - [(grid_table.c.x == x) & (grid_table.c.y == y), value] - for (x, y), value in update_dict.items() -) - -# Create the case statement -case_stmt = case( - { (x, y): value for (x, y), value in update_dict.items() }, - value=grid_table.c.x, # Assuming `x` is the column being compared - else_=grid_table.c.value -) - -case_stmt = case( - { - (x, y): value - for (x, y), value in update_dict.items() - }, - value=grid_table.c.x, - else_=grid_table.c.value -) - -# Create the case statement -# Create a CASE statement using a dictionary -case_stmt = case( - { - (grid_table.c.x == x) & (grid_table.c.y == y): value - for (x, y), value in update_dict.items() - }, - else_=grid_table.c.value -) -case_stmt = case( - {((grid_table.c.x == x) & (grid_table.c.y == y)): value - for (x, y), value in update_dict.items()}, - else_=grid_table.c.value -) -print("Case Statement:", str(case_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - -# Create the update statement -update_stmt = ( - 
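# The case() constructions above mix the legacy list signature with the 1.4/2.0 signature,
# where case() takes positional (condition, value) tuples. A self-contained sketch of the
# single-statement conditional update, with illustrative names:
from sqlalchemy import Column, Float, Integer, MetaData, Table, case, create_engine, insert, select, update

example_metadata = MetaData()
example_grid = Table(
    "grid", example_metadata,
    Column("x", Integer, primary_key=True),
    Column("y", Integer, primary_key=True),
    Column("value", Float),
)
example_engine = create_engine("sqlite:///:memory:")
example_metadata.create_all(example_engine)

update_dict = {(1, 1): 1.0, (2, 2): 1.0}
case_stmt = case(
    *[((example_grid.c.x == x) & (example_grid.c.y == y), value)
      for (x, y), value in update_dict.items()],
    else_=example_grid.c.value,
)

with example_engine.begin() as connection:
    connection.execute(insert(example_grid), [{"x": i, "y": j, "value": 0.0} for i in (1, 2) for j in (1, 2)])
    connection.execute(update(example_grid).values(value=case_stmt))
    print(connection.execute(select(example_grid)).fetchall())
    # [(1, 1, 1.0), (1, 2, 0.0), (2, 1, 0.0), (2, 2, 1.0)]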
update(grid_table). - where(grid_table.c.value != case_stmt). - values(value=case_stmt) -) - -print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - -# Print the SQL for each update -for (x, y), value in update_dict.items(): - update_stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=value) - ) - # Print the SQL statement with literal values for debugging - print("Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - - # Execute the update statement - with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") - -# Execute the update -with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries.") - -engine.dispose() - -engine = create_engine(f"sqlite:///{db_file}") -metadata = MetaData() -grid_table = Table('grid', metadata, autoload_with=engine) -# Verify the updated rows -select_stmt = select(grid_table) - -with engine.connect() as connection: - result = connection.execute(select_stmt) - rows = result.fetchall() - -for row in rows: - print(row) - -# Define your SQLite engine and metadata -engine = create_engine(F'sqlite:///{db_file}') -metadata = MetaData() - -# Reflect the grid table -grid_table = Table('grid', metadata, autoload_with=engine) - -# Define your update dictionary -update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} - -# Execute updates +# import os +# from pathlib import Path + +# import numpy as np +# import pandas as pd +# from sqlalchemy import create_engine, text + +# SQL_COMMANDS["create"].format(**{"table_name": "A", "column_definitions": "B"}) + +# # Coordinates +# x = np.array([1, 2, 3, 4, 5]) +# y = np.array([1, 2, 3, 4, 5]) + +# # Create the grid points +# grid_points = [(i, j, 0) for i in x for j in y] + +# def initialize_grid(): + + +# data_root_dir = Path("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/") +# db_directory = data_root_dir / "database" +# # ---- Create the directory if it does not already exist +# db_directory.mkdir(parents=True, exist_ok=True) +# # ---- Complete path to `biology.db` +# db_file = db_directory / "grid.db" + +# from sqlalchemy import MetaData, Table, case, create_engine, inspect, select, text, update + +# engine = create_engine(f"sqlite:///{db_file}") + +# # Define metadata and the table to drop +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Drop the table # with engine.connect() as connection: -connection = engine.connect() -# for (x, y), value in update_dict.items(): -(x,y) = (1, 1) -value = update_dict[(1,1)] - -update_stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=value) -) -# Print the SQL statement for debugging -print("Executing Update Statement:", str(update_stmt.compile(engine, compile_kwargs={"literal_binds": True}))) - -# Execute the update statement -result = connection.execute(update_stmt) -print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") -connection.close() - -select_stmt = select(grid_table.c.x) - -# Execute the SELECT statement -with engine.connect() as connection: - result = connection.execute(select_stmt) - x_values = result.fetchall() - -type(x_values[0]) - -select_stmt = select(grid_table.c.y) - -# Execute the SELECT statement -with engine.connect() as connection: - 
result = connection.execute(select_stmt) - y_values = result.fetchall() - -select_stmt = select(grid_table.c.value) - -# Execute the SELECT statement -with engine.connect() as connection: - result = connection.execute(select_stmt) - values = result.fetchall() - -case_stmt = case( - *[(grid_table.c.x == x) & (grid_table.c.y == y, value) - for (x, y), value in update_dict.items()], - else_=grid_table.c.value -) - -update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} - -with engine.connect() as connection: - # Select all values to check the current state - result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) - current_values = result.fetchall() - print("Current Values:", current_values) - -with engine.connect() as connection: - with connection.begin(): # Begin a transaction - for (x, y), value in update_dict.items(): - stmt = ( - update(grid_table) - .where((grid_table.c.x == x) & (grid_table.c.y == y)) - .values(value=grid_table.c.value + value) - ) - connection.execute(stmt) - -with engine.connect() as connection: - # Re-select to check the updated state - result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) - updated_values = result.fetchall() - print("Updated Values:", updated_values) - - -# Confirm the updates -with engine.connect() as connection: - select_stmt = select([grid_table]) - result = connection.execute(select_stmt) - rows = result.fetchall() - -# Print all rows to verify updates -print("Database contents after update:") -for row in rows: - print(row) - - -# Construct the update statement -update_stmt = ( - update(grid_table) - .values(value=case_stmt) - .where(grid_table.c.value != case_stmt) -) - -# Create a SELECT statement to fetch all rows from the grid_table -select_stmt = select(grid_table) - -# Execute the SELECT statement and fetch results -with engine.connect() as connection: - result = connection.execute(select_stmt) - rows = result.fetchall() - -# Print or inspect the fetched rows -for row in rows: - print(row) - -# Create the update statement -update_stmt = ( - update(grid_table) - .where(grid_table.c.value != case_stmt) - .values(value=case_stmt) -) - -# Execute the update -with engine.connect() as connection: - result = connection.execute(update_stmt) - print(f"Updated {result.rowcount} entries.") - -case( - [ - ((grid_table.c.x == x) & (grid_table.c.y == y), value) - for (x, y), value in update_dict.items() - ], - else_=grid_table.c.value -) - -# Create a case statement for conditional update -case_statements = { - (x, y): case( - [(grid_table.c.x == x) & (grid_table.c.y == y, value)], - else_=grid_table.c.value - ) - for (x, y), value in update_dict.items() -} - - -# Define SQL command to select all data from the grid table -select_sql = "SELECT * FROM grid;" - -# Connect to the database and execute the query -with engine.connect() as connection: - try: - # Execute the select command - result = connection.execute(text(select_sql)) - # Fetch all rows from the result - rows = result.fetchall() - # Print the results - print("Data in grid table:") - for row in rows: - print(row) - except Exception as e: - print("An error occurred: {}".format(e)) - -# Coordinates to update -update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} - -# Create a copy of grid_points and update specific coordinates -updated_grid_points = [ - (i, j, 1.0) if (i, j) in update_coords else (i, j, value) - for i, j, value in grid_points -] - -# Retrieve current data from the database -with 
engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - current_data = result.fetchall() - -# Convert to a dictionary for easy comparison -current_values = {(x, y): value for x, y, value in current_data} - -# Convert updated_grid_points to a dictionary -updated_values = {(i, j): value for i, j, value in updated_grid_points} - -# Find differences -differences = [ - (i, j, value) - for i, j, value in updated_grid_points - if (i, j) in updated_values and (i, j) not in current_values or - (i, j) in current_values and current_values[(i, j)] != value -] - -# Update differing values in the database -with engine.connect() as connection: - for i, j, value in differences: - connection.execute( - text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), - ) - print(f"Updated {len(differences)} entries.") - -# Step 8: Read the table into Python -with engine.connect() as connection: - # Query to select all rows from the table - result = connection.execute(text("SELECT x, y, value FROM grid;")) - df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) - -# Print the DataFrame to validate the changes -print(df) - -# Check current values -with engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} - -print("Current grid points in database:") -for row in current_values.items(): - print(row) - -print("Updated grid points with changes:") -for row in updated_grid_points: - print(row) - -# Determine differences -differences = [ - (i, j, value) - for i, j, value in updated_grid_points - if (i, j) in current_values and current_values[(i, j)] != value -] - -print(f"Differences to update: {differences}") - -# Step 6: Update the database with INSERT OR REPLACE -with engine.connect() as connection: - with connection.begin(): # Ensure transactions are committed - for i, j, value in updated_grid_points: - sql = """ - INSERT OR REPLACE INTO grid (x, y, value) - VALUES (:x, :y, :value) - """ - print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") - connection.execute( - text(sql), - {"x": i, "y": j, "value": value} - ) - print(f"Updated entries with INSERT OR REPLACE.") - -# Step 8: Read the table into Python -with engine.connect() as connection: - result = connection.execute(text("SELECT x, y, value FROM grid;")) - rows = result.fetchall() - df = pd.DataFrame(rows, columns=['x', 'y', 'value']) - -# Print the DataFrame to validate the changes -print("Updated table data:") -print(df) - - -engine.dispose() - -# Check if the file exists and then remove it -if db_file.exists(): - db_file.unlink() - print(f"Deleted the file: {db_file}") -else: - print(f"The file does not exist: {db_file}") - -with engine.connect() as connection: - connection.execute(text(""" - CREATE TABLE IF NOT EXISTS grid ( - x INTEGER, - y INTEGER, - value REAL, - PRIMARY KEY (x, y) - ); - """)) - - connection.execute(text(""" - INSERT OR REPLACE INTO grid (x, y, value) VALUES - (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), - (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), - (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), - (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), - (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); - """)) - - # Insert initial values (0) into the grid table - values = ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) - connection.execute(text("INSERT INTO grid (x, y, value) 
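# A small sketch of the difference check above: compare the values currently stored for each
# (x, y) cell against the updated grid points and keep only cells that actually changed.
# The dictionaries below are stand-ins for the database query results.
current_values = {(1, 1): 0.0, (2, 2): 0.0, (3, 3): 2.0}
updated_grid_points = [(1, 1, 1.0), (2, 2, 0.0), (3, 3, 2.0)]

differences = [
    (i, j, value)
    for i, j, value in updated_grid_points
    if (i, j) not in current_values or current_values[(i, j)] != value
]
print(differences)  # [(1, 1, 1.0)] -- only the changed cell needs an UPDATE / INSERT OR REPLACE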
VALUES {values};".format(values=values))) - - # Commit - connection.commit() - - # Verify data insertion - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - print("Data in grid table:", rows) - - connection.execute(text(""" - INSERT INTO grid (x, y, value) VALUES - """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) - -engine.dispose() - - - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - print("Data in grid table:", rows) - -with engine.connect() as connection: - result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) - print(result.fetchall()) - -with engine.connect() as connection: - # Describe the table schema - result = connection.execute(text("PRAGMA table_info(grid);")) - columns = result.fetchall() - print("Table schema:", columns) - -with engine.connect() as connection: - result = connection.execute(text("SELECT * FROM grid;")) - rows = result.fetchall() - for row in rows: - print(row) - -SQL(db_file, command="select") - - - - - -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import geopandas as gpd -from geopy.distance import distance -from shapely.geometry import Polygon, Point, box -import geopandas as gpd -from shapely.ops import unary_union -import pyproj -import geopy -from echopop.spatial.projection import wgs84_to_utm, utm_string_generator -import shapely.geometry -from echopop.survey import Survey -survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml" , - survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml" ) - - -grid_settings = file_configuration["geospatial"]["griddify"] -# lat_min = grid_settings["bounds"]["latitude"][0] -lat_min = 33.75 -# lat_max = grid_settings["bounds"]["latitude"][1] -lat_max = 55.50 -# lon_min = grid_settings["bounds"]["longitude"][0] -lon_min = -134.25 -lon_max = grid_settings["bounds"]["longitude"][1] - -projection = file_configuration["geospatial"]["projection"] - -utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) -utm_num = int(utm_code) -utm_str = f"epsg:{utm_num}" - -biology_data = filtered_biology_output - -from sqlalchemy import create_engine, text, Engine, inspect -root_dir = file_configuration["data_root_dir"] -db_directory = Path(root_dir) / "database" -db_directory.mkdir(parents=True, exist_ok=True) -db_file = db_directory / "biology.db" -# Create the engine with the full path -engine = create_engine(f'sqlite:///{db_file}') - -SQL_COMMANDS = { - "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", - "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", - "drop": "DROP TABLE IF EXISTS {table_name};", - "select": "SELECT {columns} FROM {table_name};", - "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", - # "insert": "INSERT INTO {table_name} ({columns});", - "insert": """ - INSERT INTO {table_name} ({columns}) - SELECT {columns} - FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) - {filter_clause}; - """, - "inspect": None, -} - -SQL_DTYPES = { - 'int32': 'INTEGER', - 'int64': 'INTEGER', - 'float64': 'FLOAT', - 'bool': 'BOOLEAN', - 'datetime64[ns]': 'DATETIME', - 'object': 'TEXT' -} - -def SQL(db_file: str, command: str, **kwargs): - - # Create engine from `db_file` string - engine = 
create_engine(f"sqlite:///{db_file}") - - # Format `columns`, if there are any and more than 1 - if "columns" in kwargs.keys(): - if isinstance(kwargs["columns"], list): - kwargs["columns"] = ", ".join(kwargs["columns"]) - else: - kwargs["columns"] = "*" - - # Format `columns`, if there are any and more than 1 - # if "filter_columns" in kwargs.keys(): - # # ---- Store the value for later - # kwargs["filter_columns_store"] = kwargs["filter_columns"] - # if isinstance(kwargs["filter_columns"], list): - # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) - - # Run the command - try: - with engine.connect() as connection: - # ---- SELECT - if command == "select": - return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) - # ---- CREATE - elif command == "create": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Check whether the table already exists or not - table_exists = ( - connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() - ) - # ---- If it doesn't, pre-allocate the table - if table_exists is None: - # ---- Get column definitions as a string - column_def_dict = { - col: SQL_DTYPES.get(str(dtype), 'TEXT') - for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) - } - # ---- Convert to a single string - kwargs["column_definitions"] = ( - ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) - ) - # ---- Create table - connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) - # ---- REPLACE - elif command == "replace": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Replace current - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, - if_exists="replace", index=False) - - # ---- INSERT - elif command == "insert": - # ---- Extract dataframe - df_to_add = kwargs["dataframe"] - # ---- Check if - # table_exists = ( - # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() - # ) - # tables = SQL(db_file, "inspect") - # ---- If it doesn't, pre-allocate the table - # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): - df_to_add.to_sql(name=kwargs["table_name"], - con=connection, - if_exists="append", index=False) - # else: - # # ---- Format `filter_columns` command if present - # if "filter_columns" in kwargs.keys(): - # # ---- Fetch table - # fetch_table = ( - # connection.execute(text( - # ("SELECT DISTINCT {filter_columns} FROM {table_name}") - # .format(**kwargs)) - # ) - # ) - # # ---- Format the SQL data into a DataFrame - # fetched_df = pd.DataFrame(fetch_table.fetchall(), columns=fetch_table.keys()) - # # ---- Create an index tuples - # index_tuples = ( - # set(fetched_df[kwargs["filter_columns_store"]] - # .itertuples(index=False, name=None)) - # ) - # # ---- Filter the dataframe - # filtered_df = ( - # df_to_add[ - # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) - # .isin(index_tuples) - # ] - # ) - # # ---- Insert the data - # filtered_df.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="append", index=False) - # else: - # df_to_add.to_sql(name=kwargs["table_name"], - # con=connection, - # if_exists="append", index=False) - # ---- INSPECT - elif command == "inspect": - return inspect(engine).get_table_names() - else: - connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) - finally: - # ---- Dispose of the engine to release any resources being pooled/used - engine.dispose() - -_ = SQL(db_file, "drop", table_name="catch_df") -_ = SQL(db_file, "drop", 
table_name="specimen_df") -_ = SQL(db_file, "drop", table_name="length_df") -_ = SQL(db_file, "drop", table_name="files_read") - -_ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) -current = SQL(db_file, "select", table_name="files_read", columns="filepath") -current - - -# Get acoustic directory and initialization settings -# ---- Files -biology_file_settings = file_configuration["input_directories"]["biological"] -# ---- General settings -biology_analysis_settings = file_configuration["biology"] - -# Get the file-specific settings, datatypes, columns, etc. -# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# ---- Extract the expected file name ID's -biology_file_ids = biology_file_settings["file_name_formats"] -# ---- Extract all of the file ids -biology_config_ids = list(biology_file_ids.keys()) -# ---- Initialize the dictionary that will define this key in the `input` attribute -biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} -# ---- Initialize the SQL dictionary -sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - -# Create full filepath -biology_directory_path = ( - Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] -) -# ---- Directory check -directory_existence = biology_directory_path.exists() -# ---- Error evaluation (if applicable) -if not directory_existence: - raise FileNotFoundError( - f"The acoustic data directory [{biology_directory_path}] does not exist." - ) -# ---- Get the defined file extension -file_extension = biology_file_settings["extension"] -# ---- Create Path.glob generator object -file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") -#---- Create list of `*.csv`` files -csv_files = list(file_path_obj) -# ---- Ensure files exist or raise error otherwise -if len(csv_files) < 1: - raise FileNotFoundError( - f"No `*.csv` files found in [{biology_directory_path}]!" 
- ) -else: - # ---- Create Path to SQL database file - db_directory = Path(file_configuration["data_root_dir"]) / "database" - # ---- Create the directory if it does not already exist - db_directory.mkdir(parents=True, exist_ok=True) - # ---- Complete path to `biology.db` - db_file = db_directory / "biology.db" - # ---- Query the external SQL database to see if the file tracking table exists - tables = SQL(db_file, "inspect") - # ---- Create a list of string-formatted Path names - csv_files_str = [str(file) for file in csv_files] - # ---- Create DataFrame - current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) - # ---- Create if it is missing and then advance `csv_files` - if "files_read" not in tables: - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", - dataframe=current_files) - # ---- Create empty list for later comparison - new_files = [] - else: - # ---- Pull already processed filenames - previous_files = SQL(db_file, "select", table_name="files_read") - # ---- Compare against the current filelist - new_files = ( - [file for file in csv_files_str if file not in set(previous_files["filepath"])] - ) - # ---- Create a DataFrame for the new files - new_files_df = pd.DataFrame(new_files, columns=["filepath"]) - # ---- Insert into the SQL database file - _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) - -# Iterate through each of the file ids and read in the data -for id in list(biology_file_ids.keys()): - # ---- Extract the specific config mapping for this tag/id - sub_config_map = biology_config_map[id] - # ---- Drop the `{FIELD_ID}` tag identifier - file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) - # ---- Replace all other tags with `*` placeholders - file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) - # ---- Create Path object with the generalized format - subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") - # ---- List all files that match this pattern - subcsv_files_str = [str(file) for file in list(subfile_path_obj)] - # ---- Filter for only new files - subset_files = set(subcsv_files_str).intersection(set(new_files)) - # ---- Pull from SQL database, if applicable - if f"{id}_df" in tables: - # ---- SELECT - sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") - # ---- Concatenate to the dictionary - sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) - # ---- Add data files not stored in SQL database - if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: - if len(subset_files) > 0: - file_list = subset_files - else: - file_list = subcsv_files_str - # ---- Create a list of relevant dataframes - sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) - for file in file_list] - # ---- Concatenate into a single DataFrame - sub_df = pd.concat(sub_df_lst, ignore_index=True) - # ---- Concatenate to the dictionary DataFrame - biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) - -# Get contrasts used for filtering the dataset -# ---- Species -species_filter = file_configuration["species"]["number_code"] -# ---- Trawl partition information -trawl_filter = biology_analysis_settings["catch"]["partition"] -# ---- Apply the filter -filtered_biology_output = { - key: df[ - (df['species_id'] == species_filter if 'species_id' in df.columns else True) & - (df['trawl_partition'].str.lower() == 
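# A sketch of the file-tracking pattern above: compare the files currently on disk against the
# "files_read" table and only ingest the difference. The paths and the in-memory database are
# placeholders for the real biology directory and biology.db.
import pandas as pd
from sqlalchemy import create_engine

example_engine = create_engine("sqlite:///:memory:")

# Pretend two files were already processed on an earlier pass
pd.DataFrame({"filepath": ["biology/haul_001.csv", "biology/haul_002.csv"]}).to_sql(
    "files_read", con=example_engine, index=False
)

# Files currently present (normally produced by Path(...).glob(f"*.{file_extension}"))
csv_files_str = ["biology/haul_001.csv", "biology/haul_002.csv", "biology/haul_003.csv"]

previous_files = set(pd.read_sql("SELECT filepath FROM files_read", con=example_engine)["filepath"])
new_files = [file for file in csv_files_str if file not in previous_files]
print(new_files)  # ['biology/haul_003.csv']

# Record the newly processed files so the next pass skips them
pd.DataFrame({"filepath": new_files}).to_sql(
    "files_read", con=example_engine, if_exists="append", index=False
)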
trawl_filter if 'trawl_partition' in df.columns else True) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty -} - -# Update the SQL database -for table_name, df in filtered_biology_output.items(): - # ---- Update - _ = SQL(db_file, "insert", table_name=table_name, columns="*", - dataframe=df) - -# Combine the two datasets -merged_output = { - key: pd.concat([ - sql_biology_output.get(key, pd.DataFrame()), - filtered_biology_output.get(key, pd.DataFrame()) - ]).drop_duplicates().reset_index(drop=True) - for key in set(sql_biology_output) | set(filtered_biology_output) -} -# ---- Return output -merged_output - -coordinate_metadata.attrs[] - -SQL(biology_db, command="drop", table_name="catch_df") -SQL(biology_db, command="drop", table_name="specimen_df") -SQL(biology_db, command="drop", table_name="length_df") -SQL(biology_db, command="drop", table_name="files_read") -_ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") -tables = SQL(db_file, "inspect") -tables -current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) -current - -SQL(db_file, "select", table_name="catch_df", columns="*") -new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) -_ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) -current = SQL("select", engine, table_name="csv_files_read", columns="file_path") -current -for table_name, df in biology_data.items(): - df.to_sql(table_name, con=engine, if_exists='append', index=False) -command = "read" -engine = create_engine(f'sqlite:///{db_file}') -table_name = "files_read" -columns = "file_path" - -kwargs = { - "table_name": table_name, - "columns": columns, -} - -zarr_data_ds["depth"].diff(dim="depth") - -prc_nasc_df.groupby(["longitude", "latitude"]) - -from pandas.core.groupby import DataFrameGroupBy - -def estimate_echometrics(acoustic_data_df: pd.DataFrame): - - # Create copy - acoustic_df = acoustic_data_df.copy().reset_index(drop=True) - - # Pre-compute the change in depth - acoustic_df["dz"] = acoustic_df["depth"].diff() - - # Initialize echometrics dictionary - echometrics = {} - - # Compute the metrics center-of-mass - if acoustic_df["NASC"].sum() == 0.0: - echometrics.update({ - "n_layers": 0, - "mean_Sv": -999, - "max_Sv": -999, - "nasc_db": np.nan, - "center_of_mass": np.nan, - "dispersion": np.nan, - "evenness": np.nan, - "aggregation": np.nan, - "occupied_area": 0.0, - }) - else: - - # Compute the number of layers - echometrics.update({ - "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size - }) - - # Compute ABC - # ---- Convert NASC to ABC - acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) - # ---- Estimate mean Sv - echometrics.update({ - "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) - }) - # --- Estimate max Sv (i.e. 
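# A minimal sketch of the merge step above: combine rows already stored in SQL with the newly
# filtered rows and drop exact duplicates. The two frames are stand-ins for entries of
# sql_biology_output and filtered_biology_output.
import pandas as pd

sql_rows = pd.DataFrame({"haul_num": [1, 2], "length": [23.5, 31.0]})
new_rows = pd.DataFrame({"haul_num": [2, 3], "length": [31.0, 27.0]})

merged = pd.concat([sql_rows, new_rows]).drop_duplicates().reset_index(drop=True)
print(merged)  # hauls 1, 2, and 3 -- the duplicated haul 2 row appears only once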
) - echometrics.update({ - "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() - / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) - }) - - # Compute (acoustic) abundance - echometrics.update({ - "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) - }) - - # Compute center of mass - echometrics.update({ - "center_of_mass": ( - (acoustic_df["depth"] * acoustic_df["NASC"]).sum() - / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the dispersion - echometrics.update({ - "dispersion": ( - ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 - * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() - ) - }) - - # Compute the evenness - echometrics.update({ - "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 - }) - - # Compute the index of aggregation - echometrics.update({ - "aggregation": 1 / echometrics["evenness"] - }) - - # Get the occupied area - echometrics.update({ - "occupied_area": ( - acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() - ) - }) - - # Return the dictionary - return echometrics - -def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): - - # Vertically integrate PRC NASC - nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} - - # Horizontally concatenate `echometrics`, if `True` - if echometrics: - # ---- Compute values - # NOTE: This uses NASC instead of linear `sv` - echometrics_dict = estimate_echometrics(acoustic_data_df) - # ---- Merge - nasc_dict.update(echometrics_dict) - - # Convert `nasc_dict` to a DataFrame and return the output - return pd.Series(nasc_dict) - -def process_group(group): - result = integrate_nasc(group, echometrics=True) - result = result.reset_index(drop=True) - # Concatenate the result back to the original group for alignment - group = group.reset_index(drop=True) - combined = pd.concat([group, result], axis=1) - return combined - -acoustic_data_df = acoustic_data["prc_nasc_df"] - - -rc_nasc_df[prc_nasc_df["distance"] == 0.0] -acoustic_data_df = mek[mek["distance"] == 0.0] -pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() -nasc_data_df = ( - prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) - .reset_index() -) - - - - -kwargs = { - "table_name": "csv_files_read", - "columns": "file_path", - "dataframe": new_files_df -} - -current_process = psutil.Process() -import logging - -# Create a session -Session = sessionmaker(bind=engine) -session = Session() - -# Perform database operations -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) -logger.info("Performing database operations") - -# Create a session -Session = sessionmaker(bind=engine) -session = Session() - -# Perform database operations -logger.info("Performing database operations") - -# Close the session -session.close() -logger.info("Session closed") - -# Dispose the engine -engine.dispose() -logger.info("Engine disposed") - -# Force garbage collection -import gc -gc.collect() -logger.info("Garbage collection performed") - -import psutil - -pid = psutil.Process().pid -process = psutil.Process(pid) -open_files = process.open_files() -db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' - -# Check if the file is still in use -for file in open_files: - if db_path in file.path: - logger.info(f"File {db_path} is still in use.") - else: - logger.info(f"File {db_path} is not in use.") - -# Define the SQL to drop the 
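# A toy check of the echometric formulas above (center of mass, dispersion, evenness, and the
# index of aggregation) on a made-up NASC-by-depth profile; the values are illustrative only.
import pandas as pd

profile = pd.DataFrame({"depth": [10.0, 20.0, 30.0, 40.0], "NASC": [0.0, 50.0, 150.0, 0.0]})

nasc_total = profile["NASC"].sum()
center_of_mass = (profile["depth"] * profile["NASC"]).sum() / nasc_total
dispersion = (((profile["depth"] - center_of_mass) ** 2) * profile["NASC"]).sum() / nasc_total
evenness = (profile["NASC"] ** 2).sum() / nasc_total ** 2
aggregation = 1.0 / evenness

print(center_of_mass, dispersion, evenness, aggregation)  # 27.5 18.75 0.625 1.6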
table -drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" -# Execute the drop table SQL -with engine.connect() as connection: - _ = connection.execute(text(drop_table_sql)) - -import sqlite3 -if os.path.exists(db_path): - conn = sqlite3.connect(db_path) - conn.close() - # Force the file to be removed - try: - os.remove(db_path) - print(f"Database file {db_path} has been deleted.") - except PermissionError: - print(f"Failed to delete {db_path}. The file is still in use.") - -create_table_sql = """ -CREATE TABLE IF NOT EXISTS csv_files_read ( - file_path TEXT UNIQUE -); -""" -# Execute the create table SQL -with engine.connect() as connection: - _ = connection.execute(text(create_table_sql)) - -root_directory = Path(root_dir) -dataset = "biology" - -# Convert to strings -csv_files_str = [str(file) for file in csv_files] - -existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) -existing_files_set = set(existing_files_df['file_path']) -# Filter out duplicates from the csv_files list -new_files = [file for file in csv_files_str if file not in existing_files_set] -# Insert only new file paths into the SQL table -if new_files: - new_files_df = pd.DataFrame(new_files, columns=['file_path']) - _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) - - -with engine.connect() as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS csv_files_read ( - file_path TEXT UNIQUE - ) - """) - -csv_files -files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) -file_name_format = biology_file_ids[id] -def compile_filename_format(file_name_format: str): - - # Create a copy of `file_name_format` - regex_pattern = file_name_format - - # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern - for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) - # ---- Replace the `FILE_ID` tag - regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) - - # Compile the regex pattern and return the output - return re.compile(regex_pattern) - -from sqlalchemy.orm import sessionmaker -Session = sessionmaker(bind=engine) -session = Session() -session.close() -engine.pool.status() -# Dispose the engine to close all connections -engine.dispose() -import gc -gc.collect() -import psutil -dbapi_conn = engine.raw_connection() -dbapi_conn.close() -# Get the process ID of the current process -pid = psutil.Process().pid - -# List all open files for the current process -process = psutil.Process(pid) -open_files = process.open_files() - -for file in open_files: - print(file.path) - - -pattern = filename_format -config_settings = sub_config_map -regex_pattern = pattern - -# Replace patterns based on LIVE_FILE_FORMAT_MAP -for key, value in LIVE_FILE_FORMAT_MAP.items(): - regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) -regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) -new_pattern = compile_filename_format(regex_pattern) -match_obj = new_pattern.search(file.name) -# Get substring components as a list -filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) -valid_tags = list(set(["HAUL", "SPECIES_CODE"]).intersection(set(filename_substrings))) - -for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) - - - -# Assign the data as new columns to the DataFrame -for key, value in data_to_add.items(): - df[key] = value - 
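# A small sketch of the resource-release checks above: dispose of the engine's connection pool,
# then use psutil to confirm the SQLite file is no longer held open by this process. The
# temporary database path is a placeholder.
import os
import tempfile

import psutil
from sqlalchemy import create_engine, text

example_db_path = os.path.join(tempfile.gettempdir(), "example_biology.db")
example_engine = create_engine(f"sqlite:///{example_db_path}")
with example_engine.begin() as connection:
    connection.execute(text("CREATE TABLE IF NOT EXISTS files_read (filepath TEXT UNIQUE)"))
example_engine.dispose()  # close all pooled connections

open_paths = [f.path for f in psutil.Process().open_files()]
print(any(example_db_path in path for path in open_paths))  # expected: False once the pool is disposed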
-for i in valid_tags: - matched_key = LIVE_FILE_FORMAT_MAP[i] - df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) -biology_analysis_settings -species_id_value = 22500 -trawl_partition_value = 'Codend' # Adjust as needed -{ - key: df[ - (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & - (('trawl_partition' not in df.columns) or (df['trawl_partition'] == trawl_partition_value)) - ] - for key, df in biology_output.items() if isinstance(df, pd.DataFrame) -} - -(match_obj.group(i)).astype(matched_key["dtype"]) -pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' -modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) -# Create the regex pattern -regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') -re.compile(regex_pattern) - -modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) - -# Create the regex pattern -regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') -compile_filename_format(regex_pattern) -# Regular expression to capture values inside the curly braces -regex = r'\{([^:}]+):([^}]+)\}' - -# Find all matches -matches = re.findall(regex, modified_pattern) - -# Get substring components as a list -filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) - -pattern_changed = pattern.replace("FILE_ID:", "") - -# Compilte the filename regular expression format -compiled_regex = compile_filename_format(pattern_changed) - -file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] - - # Get the file name and produce a `re.Match` object -match_obj = compiled_regex.search(file.name) - - -def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): - - # Get the file name and produce a `re.Match` object - match_obj = pattern.search(file.name) - - # Read in the `*.csv` file - df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) - - # Validate the dataframe - # ---- Check for any missing columns - missing_columns = ( - [key for key in config_settings["dtypes"].keys() if key not in df.columns] - ) - # ---- Raise Error, if needed - if missing_columns: - raise ValueError( - f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
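# A hedged sketch of the filename-pattern compilation above. The real LIVE_FILE_FORMAT_MAP is
# defined elsewhere in the package, so the token expressions below are assumptions made purely
# for illustration.
import re

EXAMPLE_FORMAT_MAP = {  # hypothetical stand-in for LIVE_FILE_FORMAT_MAP
    "DATE": {"expression": r"(?P<DATE>\d{6})"},
    "HAUL": {"expression": r"(?P<HAUL>\d+)"},
}

def compile_example_filename_format(file_name_format: str) -> re.Pattern:
    regex_pattern = file_name_format
    for key, value in EXAMPLE_FORMAT_MAP.items():
        regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"])
    # Reduce "{FILE_ID:catch_perc}" to the literal identifier "catch_perc"
    regex_pattern = re.sub(r"\{FILE_ID:(.+?)\}", r"\1", regex_pattern)
    return re.compile(regex_pattern)

pattern = compile_example_filename_format("{DATE}_{HAUL}_{FILE_ID:catch_perc}")
match_obj = pattern.search("202407_017_catch_perc.csv")
print(int(match_obj.group("HAUL")))  # 17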
- ) - # ---- Ensure the correct datatypes - df_validated = df.astype(config_settings["dtypes"]) - - # Replace column names and drop - df_validated = df_validated.rename(columns=config_settings["names"]) - - # Get the haul number and add the the dataframe - # ---- Extract the haul number and convert to an integer - haul_num = int(match_obj.group("HAUL")) - # ---- Add the column - df_validated["haul_num"] = haul_num - - # Return the resulting DataFrame - return df_validated - -boundary_dict = griddify_definitions["bounds"] - -from geopy.distance import distance -import numpy as np -import pandas as pd -import geopandas as gpd -from echopop.spatial.projection import utm_string_generator - -## -grid_settings["grid_resolution"]["x"] = 50 -grid_settings["grid_resolution"]["y"] = 50 -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters - -# CREATE BOUNDING -bound_df = pd.DataFrame({ - "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), - "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) -}) - -bound_gdf = gpd.GeoDataFrame( - data=bound_df, - geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), - crs = projection -) -from echopop.spatial.projection import utm_string_generator -import shapely.geometry -utm_string_generator(-117.0, 33.75) -bound_gdf.total_bounds -# Convert to UTM -bound_utm = bound_gdf.to_crs(utm_num) -bound_utm.total_bounds -y_step = lat_step -x_step = lon_step -# bound_utm = bound_gdf -# y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 -# x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 - -xmin, ymin, xmax, ymax = bound_utm.total_bounds - -# Get number of cells -n_x_cells = int(np.ceil((xmax - xmin) / x_step)) -n_y_cells = int(np.ceil((ymax - ymin) / y_step)) - -import pyproj -# create the cells in a loop -# grid_cells = [] -# for x0 in np.arange(xmin, xmax, x_step): -# for y0 in np.arange(ymin, ymax, y_step): -# # bounds -# utm_zone = utm_string_generator(x0, y0) -# proj = pyproj.Proj(f"epsg:{utm_code}") -# x1 = x0-x_step -# y1 = y0+y_step -# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) +# grid_table.drop(connection) +# print("Table 'grid' has been dropped.") + +# # Inspect the database +# inspector = inspect(engine) +# tables = inspector.get_table_names() +# print(tables) + +# def create_table_sql(table_name, columns, primary_keys=None, index_columns=None): +# """ +# Generate a SQL command to create a table with dynamic columns, primary keys, and indices. + +# Args: +# table_name (str): The name of the table. +# columns (dict): A dictionary where keys are column names and values are data types. +# primary_keys (list, optional): List of column names to be used as primary keys. +# index_columns (list, optional): List of column names to be indexed. + +# Returns: +# str: The SQL command to create the table. 
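# A compact sketch of the gridding loop above: convert a nautical-mile resolution to metres with
# geopy, then tile a projected (UTM) bounding box with shapely boxes. The bounds and the 25 nmi
# resolution are made up for illustration; the real values come from the geospatial configuration.
import numpy as np
import shapely.geometry
from geopy.distance import distance

cell_size_m = distance(nautical=25).meters  # 25 nmi is roughly 46,300 m
xmin, ymin, xmax, ymax = 300_000.0, 3_700_000.0, 500_000.0, 3_900_000.0  # UTM metres

grid_cells = [
    shapely.geometry.box(x0, y0, x0 + cell_size_m, y0 + cell_size_m)
    for y0 in np.arange(ymin, ymax, cell_size_m)
    for x0 in np.arange(xmin, xmax, cell_size_m)
]
print(len(grid_cells))  # 25 cells for this 5 x 5 tiling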
+# """ +# # Generate column definitions +# column_definitions = ",\n ".join(f"{col} {dtype}" for col, dtype in columns.items()) + +# # Generate primary key definition +# primary_key_definition = "" +# if primary_keys: +# primary_key_definition = f",\n PRIMARY KEY ({', '.join(primary_keys)})" + +# # Generate index definitions +# index_definitions = "" +# if index_columns: +# index_definitions = "\n".join( +# f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{col} ON {table_name} ({col});" +# for col in index_columns +# ) + +# # Combine all parts into the final SQL command +# create_table_command = f""" +# CREATE TABLE IF NOT EXISTS {table_name} ( +# {column_definitions} +# {primary_key_definition} +# ); +# """ +# # Return the command and any index definitions +# return create_table_command.strip() + "\n" + index_definitions + +# # Define metadata and the table to drop +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Drop the table +# with engine.connect() as connection: +# grid_table.drop(connection) +# print("Table 'grid' has been dropped.") -grid_cells = [] -for y0 in np.arange(ymin, ymax, y_step): - - # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) - - for x0 in np.arange(xmin, xmax, x_step): - # bounds - # utm_zone = utm_string_generator(x0, y0) - # proj = pyproj.Proj(f"epsg:{utm_code}") - # x1, y1 = proj(x0, y0) - # x2, y2 = proj(x0 - x_step, y0 + y_step) - # grid_cells.append(box(x1, y1, x2, y2)) - x1 = x0-x_step - y1 = y0+y_step - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - -cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) -cells_gdf.shape -n_x_cells * n_y_cells -# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) -cells_gdf.total_bounds -cells_gdf.to_crs(projection).total_bounds -from shapely.validation import make_valid -from shapely.geometry import mapping -######## -world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ne_10m_land/ne_10m_land.shp") -bb_orig = box(lon_min, lat_min, lon_max, lat_max) -boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) -world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) -world_clipped_latlon = gpd.clip(world, boundary_box) -world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) - -world_utm = world.to_crs(utm_code) -world_utm = world_utm[~world_utm.is_empty] - -bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) - -gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) - -bbox_utm = bound_utm.total_bounds - -buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] -array_buffer = bbox_utm + buffer -array_names = ["minx", "miny", "maxx", "maxy"] -buffered = dict(zip(array_names, array_buffer)) -buffer_boundary = box(**buffered) -# box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) -# buffer_boundary = buffer_boundary.to_crs(world_utm.crs) - -buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) # Replace with the correct EPSG code -bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) -# sub_clipped = gpd.clip(world_utm, buffer_boundary) -# sub_clipped = gpd.clip(world_utm, bbox_utm) +# check_table_exists(engine, "grid") -# fig, ax = plt.subplots(figsize=(10, 10)) -# # Plot the buffer_boundary -# world.plot(ax=ax, linewidth=2, color='gray') -# buffer_boundary_gdf.to_crs(projection).plot(ax=ax, 
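+# NOTE: Self-contained sketch of the gridding logic above: project the lat/lon bounding box
+# into UTM, derive the cell size from a nautical-mile resolution via geopy, and tile the
+# projected bounds with shapely boxes. The 50 nmi resolution, the bounds, and the
+# EPSG:32610 zone are assumptions for illustration.
+import geopandas as gpd
+import numpy as np
+import shapely.geometry
+from geopy.distance import distance
+
+lon_min, lat_min, lon_max, lat_max = -134.25, 33.75, -120.25, 55.50
+utm_code = "epsg:32610"
+
+# Cell size: nautical miles -> meters (UTM coordinates are in meters)
+step = distance(nautical=50.0).meters
+
+bounds = gpd.GeoDataFrame(
+    geometry=[shapely.geometry.box(lon_min, lat_min, lon_max, lat_max)], crs="epsg:4326"
+).to_crs(utm_code)
+xmin, ymin, xmax, ymax = bounds.total_bounds
+
+# Tile the projected bounding box with square cells
+grid_cells = [
+    shapely.geometry.box(x0, y0, x0 + step, y0 + step)
+    for y0 in np.arange(ymin, ymax, step)
+    for x0 in np.arange(xmin, xmax, step)
+]
+cells_gdf = gpd.GeoDataFrame(geometry=grid_cells, crs=utm_code)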
facecolor='none', edgecolor='blue') -# bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') -# plt.xlim(lon_min-3, lon_max+3) -# plt.ylim(lat_min-3, lat_max+3) -# plt.show() -from echopop.live.sql_methods import SQL -from shapely import wkt -import matplotlib.pyplot as plt -import geopandas as gpd -import matplotlib.colors as colors -import matplotlib.cm as cm -import numpy as np -from matplotlib.colors import ListedColormap -import matplotlib.dates as mdates -from datetime import datetime -db_filepath = realtime_survey.config["database"]["grid"] -survey_db = realtime_survey.config["database"]["acoustics"] -grid_df = SQL(db_filepath, "select", table_name="grid_df") -# grid_df[grid_df.abundance > 0] -grid_df[grid_df.abundance > 1e10] -# grid_df[grid_df.abundance > 0] -coast_df = SQL(db_filepath, "select", table_name="coastline_df") -survey_df = SQL(survey_db, "select", table_name="survey_data_df") - -# def parse_datetime(date_str): -# # List of possible formats -# formats = [ -# '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds -# '%Y-%m-%d %H:%M:%S', # Without fractional seconds -# '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds -# '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds -# ] - -# for fmt in formats: +# with engine.connect() as connection: +# sql_create(connection, df, table_name, primary_keys) + +# # Create the table +# table_name = "grid" +# columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +# primary_keys = ["x", "y"] +# index_columns = ["x", "y"] + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# with engine.connect() as connection: +# connection.execute(text(create_sql)) + +# inspector = inspect(engine) +# tables = inspector.get_table_names() +# print(tables) + +# check_table_exists(engine, "grid") + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# converted_data[0] +# check_table_exists(engine, "files_read") + +# zarr_files_str = ["A", "B", "C", "D"] +# # ---- Create DataFrame +# current_files = pd.DataFrame(zarr_files_str, columns=["filepath"]) + +# with engine.connect() as connection: +# sql_create(connection, table_name="files_read", df=current_files) +# sql_insert(connection, table_name="files_read", columns=["filepath"], dataframe=current_files) + +# table_name = "files_read" +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + + +# from sqlalchemy.exc import IntegrityError + + +# def insert_or_update(engine, table_name, columns, data, conflict_columns): +# """ +# Insert or update data in a table. + +# Args: +# engine (Engine): The SQLAlchemy engine instance. +# table_name (str): The name of the table. +# columns (list): List of column names. +# data (list of dict): List of dictionaries containing data to insert or update. +# conflict_columns (list): List of column names to use for conflict resolution. 
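+# NOTE: Sketch of the geometry round-trip used above: SQLite has no geometry type, so cell
+# polygons are stored as WKT text and parsed back with shapely.wkt on read. Table and column
+# names ("grid_df", "abundance") follow the scratch code but the data are made up.
+import geopandas as gpd
+import pandas as pd
+from shapely import wkt
+from shapely.geometry import box
+from sqlalchemy import create_engine
+
+engine = create_engine("sqlite:///:memory:")
+
+# Write: serialize the geometries to WKT strings alongside the cell attributes
+cells = gpd.GeoDataFrame({"abundance": [0.0, 12.5]},
+                         geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)], crs="epsg:4326")
+pd.DataFrame({"abundance": cells["abundance"],
+              "geometry": cells.geometry.to_wkt()}).to_sql("grid_df", con=engine, index=False)
+
+# Read: parse the WKT back into shapely objects and rebuild the GeoDataFrame
+grid_df = pd.read_sql("SELECT * FROM grid_df", con=engine)
+grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads)
+grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs="epsg:4326")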
+# """ + +# # Prepare the SQL statement for insertion +# column_names = ", ".join(columns) +# placeholder = ", ".join(f":{col}" for col in columns) +# # values_list = ", ".join(f"({', '.join(f':{col}' for col in columns)})" for _ in data) +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + + +# # Construct the SQL query +# sql = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_str} +# ON CONFLICT ({', '.join(conflict_columns)}) +# DO UPDATE SET {', '.join(f'{col}=excluded.{col}' for col in columns)} +# """ + +# # Flatten the list of data for execution +# # flattened_data = [item for sublist in [[(item[col] for col in columns)] for item +# in data] for item in sublist] + +# # Execute the SQL command +# with engine.connect() as connection: # try: -# return pd.to_datetime(date_str, format=fmt) -# except (ValueError, TypeError): -# continue # Try the next format - -# return pd.NaT # Return NaT if no formats match +# connection.execute(text(sql)) +# # connection.commit() +# print(f"Data inserted or updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") -# survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) +# # Prepare data for insertion or update +# # data = [{'x': i, 'y': j, 'value': v} for i, j, v in grid_points] +# data = grid_points -# pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") +# # Insert or update data +# insert_or_update(engine, table_name, columns.keys(), data, primary_keys) -# fig, ax = plt.subplots(figsize=(5, 8)) -# ax.scatter(survey_df.ping_time, survey_df.nasc) -# plt.ylabel("NASC") -# # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) -# plt.show() +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() +# for row in rows: +# print(row) -# times = np.arange(np.datetime64('2001-01-02'), -# np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) -# y = np.random.randn(len(times)) -# survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() -# survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() +# def update_specific_rows(engine, table_name, updates, conditions): +# """ +# Update specific rows in a table based on conditions. -# fig, ax = plt.subplots() -# ax.plot(times, y) -# survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] -# # a = self.input["acoustics"]["prc_nasc_df"] -# # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] +# Args: +# engine (Engine): The SQLAlchemy engine instance. +# table_name (str): The name of the table. +# updates (dict): Dictionary of columns and their new values to be updated. +# conditions (dict): Dictionary of columns and their values to be used in the WHERE clause. 
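+# NOTE: Minimal working sketch of the insert-or-update helper above, using bound parameters
+# instead of string-formatted VALUES (avoids quoting problems). Requires SQLite >= 3.24 for
+# "ON CONFLICT"; the grid schema is an assumption.
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+rows = [{"x": 1, "y": 1, "value": 1.0}, {"x": 2, "y": 2, "value": 1.5}]
+
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    connection.execute(text(
+        "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value) "
+        "ON CONFLICT (x, y) DO UPDATE SET value = excluded.value"
+    ), rows)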
+# """ -grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) -coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) +# # Construct the SET clause for the update +# set_clause = ", ".join(f"{col} = :{col}" for col in updates.keys()) -projection = realtime_survey.config["geospatial"]["projection"] +# # Construct the WHERE clause for the update +# where_clause = " AND ".join(f"{col} = :{col}_cond" for col in conditions.keys()) + +# # Construct the SQL query +# sql = f""" +# UPDATE {table_name} +# SET {set_clause} +# WHERE {where_clause} +# """ + +# # Prepare parameters for the query +# parameters = {**updates, **{f"{col}_cond": val for col, val in conditions.items()}} + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), parameters) +# print(f"Rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# # Define table name +# table_name = "grid" +# # Define the table and columns +# table_name = 'grid' +# condition_columns = ['x', 'y'] + +# # Define the updates and conditions +# dd = {"x": np.array([1, 2, 3 , 4, 5]),"" "y": np.array([1, 2, 3 , 4, 5]), "value": +# np.array([1, 2, 3 , 4, 5]).astype(float)} +# new_data = pd.DataFrame(dd) +# new_data +# df = new_data + +# kwargs = {"table_name": "grid", "columns": df.columns, "df": df} + +# with engine.connect() as connection: +# # sql_create(connection, table_name = "grid", df = df) +# # sql_validate(connection, "grid") +# # sql_drop(connection, "grid") +# sql_insert(connection, table_name="grid", columns=df.columns, dataframe=df, +# id_columns=["x", "y"]) + + +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# all_columns = df.columns.tolist() +# if len(condition_columns) >= len(all_columns): +# raise ValueError("The number of condition columns must be less than the number of +# columns in data.") + +# # Prepare column names and conditions +# update_columns = [col for col in all_columns if col not in condition_columns] +# condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +# update_str = ", ".join(f"{col} = ?" 
for col in update_columns) +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] +# # Generate values string for SQL command +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data_tuples +# ) + +# # Construct the SQL query +# sql = f""" +# INSERT INTO {table_name} ({', '.join(all_columns)}) +# VALUES {values_str} +# ON CONFLICT ({', '.join(condition_columns)}) +# DO UPDATE SET {', '.join(f'{col} = {table_name}.{col} + excluded.{col}' for col inupdate_columns)} +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql)) +# connection.commit() +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + + +# # Insert or update data +# insert_or_update(engine, table_name, columns.keys(), data, primary_keys) + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# # Ensure that condition_columns match the length of data tuples minus the update column +# if len(condition_columns) != len(df.columns) - 1: +# raise ValueError("The number of condition columns must match the number of columns in +# data minus the update column.") + +# # Prepare the SQL statement for update +# update_columns = [col for col in df.columns if col not in condition_columns] +# condition_str = " AND ".join(f"{col} = ?" for col in condition_columns) +# update_str = ", ".join(f"{col} = ?" 
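+# NOTE: Variant of the conflict clause drafted above in which collisions accumulate rather
+# than overwrite (e.g. summing NASC into a grid cell as new transects arrive); the schema and
+# values are illustrative.
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    upsert = text(
+        "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value) "
+        "ON CONFLICT (x, y) DO UPDATE SET value = grid.value + excluded.value"
+    )
+    connection.execute(upsert, [{"x": 1, "y": 1, "value": 2.0}])
+    connection.execute(upsert, [{"x": 1, "y": 1, "value": 3.0}])  # stored value becomes 5.0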
for col in update_columns) +# # Convert DataFrame rows to list of tuples +# data_tuples = [tuple(row) for row in df.itertuples(index=False)] + +# # Generate a values string for the SQL command +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data_tuples +# ) +# # Construct the SQL query +# sql = f""" +# UPDATE {table_name} +# SET {update_str} +# WHERE {condition_str} +# """ + +# # Flatten the list of data for execution +# flattened_data = [] +# for row in data_tuples: +# conditions = row[:len(condition_columns)] +# update_values = row[len(condition_columns):] +# flattened_data.extend(conditions + update_values) + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), flattened_data) +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") + +# # Execute the SQL command +# with engine.connect() as connection: +# try: +# connection.execute(text(sql), flattened_data) +# print(f"Specific rows updated successfully in table '{table_name}'.") +# except IntegrityError as e: +# print(f"IntegrityError: {e}") +# except Exception as e: +# print(f"An error occurred: {e}") +# # Update specific rows +# update_specific_rows(engine, table_name, updates, conditions) + +# # Verify the update +# sql_command = f"SELECT * FROM {table_name};" +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# for row in rows: +# print(row) +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) +# connection.commit() + +# check_table_exists(engine, "grid") + +# # Define table name, columns, and data +# table_name = 'grid' +# columns = ['x', 'y', 'value'] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0) +# ] + +# # Prepare the columns part of the SQL statement +# columns_str = ", ".join(columns) + +# # Prepare the values part of the SQL statement +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + + +# print("Generated SQL Command:") +# print(sql_command) + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) + +# def insert_values_sql(table_name, columns, values, filter_clause=""): +# """ +# Generate a SQL command to insert values into a table. + +# Args: +# table_name (str): The name of the table. +# columns (list): List of column names to be inserted. +# values (list of tuples): List of tuples where each tuple represents a row of values +# to be inserted. +# filter_clause (str, optional): Optional filter clause to specify conditions for insertion. + +# Returns: +# str: The SQL command to insert values into the table. +# """ +# # Generate column names +# column_names = ", ".join(columns) + +# # Generate value placeholders +# value_placeholders = ", ".join("?" * len(columns)) + +# # Generate values part +# values_part = ", ".join(f"({', '.join('?' 
* len(columns))})" for _ in values) + +# # Flatten the values list for insertion +# flattened_values = [item for sublist in values for item in sublist] + +# # Create the SQL command +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# {filter_clause} +# """ +# return insert_command.strip(), flattened_values + +# # Define the values for insertion +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0)] + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Data:\n", insert_data) + +# insrt_stmt = + +# with engine.connect() as connection: +# connection.execute(text(insert_sql), tuple(insert_data)) + +# # Define the values for insertion +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# # Call the function +# insert_or_update_table(engine, table_name, columns, data, conflict_columns) + +# # Example usage +# table_name = "grid" +# columns = ["x", "y", "value"] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0), +# ] + +# sql_command = "INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)" +# test_data = [{'x': 1, 'y': 1, 'value': 1.0}] + +# with engine.connect() as connection: +# connection.execute(text(sql_command), test_data) + +# # Generate the SQL command and data +# insert_stmt = insert_into_table(table_name, columns, data) + +# # Print the generated SQL command (for validation) +# print("Insert SQL Command:") +# print(insert_stmt) + +# # Print for validation +# print("Insert SQL Command:") +# print(insert_sql) +# print("Data:") +# print(insert_data) + +# # Example execution with SQLAlchemy +# with engine.connect() as connection: +# connection.execute(insert_stmt) + +# def insert_values_sql(table_name, columns, values): +# """ +# Generate SQL command for inserting values into a table. + +# Args: +# table_name (str): The name of the table. +# columns (list): List of column names. +# values (list of tuples): List of values to insert. + +# Returns: +# str: The SQL command to insert the values. +# list: Flattened list of values for binding to the SQL command. +# """ +# column_names = ", ".join(columns) +# value_placeholders = ", ".join("?" * len(columns)) +# values_part = ", ".join(f"({value_placeholders})" for _ in values) +# flattened_values = [item for sublist in values for item in sublist] + +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# """ +# return insert_command.strip(), flattened_values + +# def check_table_exists(engine, table_name): +# """ +# Check if a table exists in the database. + +# Args: +# engine: SQLAlchemy engine object. +# table_name (str): The name of the table to check. + +# Returns: +# bool: True if the table exists, False otherwise. +# """ +# inspector = inspect(engine) +# return table_name in inspector.get_table_names() + +# with engine.connect() as connection: +# # sql_validate(connection, "grid") +# sql_inspect(connection) +# sql_drop(connection, table_name) -grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) -grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] -coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) +# def select_from_table(engine, table_name, columns='*'): +# """ +# Select data from a table. 
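+# NOTE: Working sketch of the two helpers above: a multi-row parameterized INSERT (letting
+# the driver handle placeholders instead of flattening values by hand) and a table-existence
+# check through SQLAlchemy's inspector. The "grid" schema is again an assumption.
+from sqlalchemy import create_engine, inspect, text
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(
+        "CREATE TABLE IF NOT EXISTS grid (x INTEGER, y INTEGER, value REAL, PRIMARY KEY (x, y));"
+    ))
+    connection.execute(
+        text("INSERT INTO grid (x, y, value) VALUES (:x, :y, :value)"),
+        [{"x": 1, "y": 1, "value": 1.0}, {"x": 2, "y": 2, "value": 1.5}],
+    )
+
+def check_table_exists(engine, table_name: str) -> bool:
+    # The inspector reflects current table names directly from the database
+    return table_name in inspect(engine).get_table_names()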
-lims = grid_gdf.total_bounds -# nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] -# dataset_gdf.stratum_x.max() -# # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) +# Args: +# engine: SQLAlchemy engine object. +# table_name (str): The name of the table to select from. +# columns (str or list): Columns to select. '*' selects all columns. + +# Returns: +# list: List of rows returned by the query. +# """ +# metadata = MetaData(bind=engine) +# table = Table(table_name, metadata, autoload_with=engine) + +# if columns == '*': +# columns = [col.name for col in table.columns] +# elif isinstance(columns, str): +# columns = [columns] + +# stmt = select([table.c[col] for col in columns]) + +# with engine.connect() as connection: +# result = connection.execute(stmt) +# return result.fetchall() + +# # Create table +# table_name = "grid" +# columns = {"x": "INTEGER", "y": "INTEGER", "value": "REAL"} +# primary_keys = ["x", "y"] +# index_columns = ["value"] + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# with engine.connect() as connection: +# connection.execute(create_sql) + +# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0), (2, 2, 20.0), (3, 3, 30.0)] + +# # Insert data function +# def insert_values_sql(table_name, columns, values): +# column_names = ", ".join(columns) +# value_placeholders = ", ".join("?" * len(columns)) +# values_part = ", ".join(f"({value_placeholders})" for _ in values) + +# insert_command = f""" +# INSERT INTO {table_name} ({column_names}) +# VALUES {values_part} +# """ +# # Flatten the list of values into a single list +# flattened_values = [value for sublist in values for value in sublist] + +# return insert_command.strip(), flattened_values + + +# table_name = 'grid' +# columns = ['x', 'y', 'value'] +# data = [ +# (1, 1, 1.0), +# (2, 2, 1.5), +# (3, 3, 2.0) +# ] + +# # Prepare the columns part of the SQL statement +# columns_str = ", ".join(columns) + +# # Prepare the values part of the SQL statement +# values_str = ", ".join( +# f"({', '.join(map(str, row))})" +# for row in data +# ) + +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + +# # Execute the SQL command +# with engine.connect() as connection: +# connection.execute(text(sql_command)) + +# sql_command = f"SELECT * FROM {table_name};" + +# with engine.connect() as connection: +# result = connection.execute(text(sql_command)) +# rows = result.fetchall() + +# print(f"Data in table {table_name}:") +# for row in rows: +# print(row) +# # Construct the full SQL command +# sql_command = f""" +# INSERT INTO {table_name} ({columns_str}) +# VALUES {values_str}; +# """ + + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Insert Data:\n", insert_data) + +# with engine.connect() as connection: +# connection.execute(insert_sql, [insert_data]) + +# # Check table existence +# exists = check_table_exists(engine, table_name) +# print(f"Table '{table_name}' exists: {exists}") + +# # Select data from table +# data = select_from_table(engine, table_name, insert_columns) +# print(f"Data from '{table_name}':") +# for row in data: +# print(row) + + +# create_sql = create_table_sql(table_name, columns, primary_keys, index_columns) +# print("Create Table SQL:\n", create_sql) + +# # Define the values for insertion 
+# insert_columns = ["x", "y", "value"] +# insert_values = [(1, 1, 10.0)] + +# insert_sql, insert_data = insert_values_sql(table_name, insert_columns, insert_values) +# print("Insert Values SQL:\n", insert_sql) +# print("Data:\n", insert_data) + +# # Example usage +# table_name = "grid" +# columns = { +# "x": "INTEGER", +# "y": "INTEGER", +# "value": "REAL" +# } +# primary_keys = ["x", "y"] +# index_columns = ["value"] + +# sql_command = create_table_sql(table_name, columns, primary_keys, index_columns) +# print(sql_command) + +# # Create the table +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS grid ( +# x INTEGER, +# y INTEGER, +# value REAL, +# PRIMARY KEY (x, y) +# ); +# """ + +# # Insert grid points +# insert_values = ", ".join(f"({i}, {j}, {v})" for i, j, v in grid_points) +# insert_sql = f""" +# INSERT INTO grid (x, y, value) VALUES {insert_values}; +# """ + +# # Connect to the database and execute the commands +# with engine.connect() as connection: +# try: +# # Create table if it does not exist +# connection.execute(text(create_table_sql)) +# # Insert grid points +# connection.execute(text(insert_sql)) +# connection.commit() +# print("Grid points successfully inserted.") +# except Exception as e: +# print(f"An error occurred: {e}") + + +# engine = create_engine(f"sqlite:///{db_file}") +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Read existing grid values from the database into a DataFrame +# with engine.connect() as connection: +# select_stmt = select(grid_table.c.x, grid_table.c.y, grid_table.c.value) +# result = connection.execute(select_stmt) +# existing_data = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# # Coordinates to update +# update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# # Create a dictionary for fast lookup +# update_dict = {(i, j): 1.0 for i, j in update_coords} + +# # Update the grid_points with new values where applicable +# updated_grid_points = [ +# (i, j, update_dict.get((i, j), value)) +# for i, j, value in grid_points +# ] + +# # Convert the list of tuples to a DataFrame +# df_updated_grid_points = pd.DataFrame(updated_grid_points, columns=['x', 'y', 'value']) + +# # Print the DataFrame +# print(df_updated_grid_points) + +# # Merge existing and updated data to find differences +# merged_data = pd.merge(existing_data, df_updated_grid_points, +# on=['x', 'y'], suffixes=('_existing', '_updated')) +# differences = merged_data[merged_data['value_existing'] != merged_data['value_updated']] + +# # Assuming 'differences' is your DataFrame with updated values +# # Create a dictionary for batch updating +# update_dict = differences.set_index(['x', 'y'])['value_updated'].to_dict() + +# # Generate the SQLAlchemy update statement +# update_stmt = update(grid_table).where( +# grid_table.c.x.in_(update_dict.keys()) +# ).values({ +# grid_table.c.value: update_dict.get((grid_table.c.x, grid_table.c.y), grid_table.c.value) +# }) + +# # Create the CASE statement +# case_stmt = case( +# { +# (grid_table.c.x == x) & (grid_table.c.y == y): value +# for (x, y), value in update_dict.items() +# }, +# else_=grid_table.c.value +# ) + +# # Convert the DataFrame into a dictionary of case statements +# case_stmt = case( +# [(grid_table.c.x == x) & (grid_table.c.y == y), value] +# for (x, y), value in update_dict.items() +# ) + +# # Create the case statement +# case_stmt = case( +# { (x, y): value for (x, y), value in update_dict.items() }, +# value=grid_table.c.x, # Assuming `x` is the column being compared 
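+# NOTE: Sketch of the "update only what changed" idea explored above: compare the stored grid
+# with a recomputed one via a pandas merge, then push just the differing cells back with a
+# parameterized UPDATE. Data and schema are illustrative.
+import pandas as pd
+from sqlalchemy import create_engine, text
+
+engine = create_engine("sqlite:///:memory:")
+existing = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 3], "value": [0.0, 0.0, 0.0]})
+existing.to_sql("grid", con=engine, index=False)
+
+updated = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 3], "value": [0.0, 1.5, 2.0]})
+
+# Keep only the cells whose value actually changed
+merged = existing.merge(updated, on=["x", "y"], suffixes=("_old", "_new"))
+changed = merged[merged["value_old"] != merged["value_new"]]
+params = [
+    {"x": int(row.x), "y": int(row.y), "value_new": float(row.value_new)}
+    for row in changed.itertuples(index=False)
+]
+
+with engine.begin() as connection:
+    connection.execute(text("UPDATE grid SET value = :value_new WHERE x = :x AND y = :y"), params)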
+# else_=grid_table.c.value +# ) + +# case_stmt = case( +# { +# (x, y): value +# for (x, y), value in update_dict.items() +# }, +# value=grid_table.c.x, +# else_=grid_table.c.value +# ) + +# # Create the case statement +# # Create a CASE statement using a dictionary +# case_stmt = case( +# { +# (grid_table.c.x == x) & (grid_table.c.y == y): value +# for (x, y), value in update_dict.items() +# }, +# else_=grid_table.c.value +# ) +# case_stmt = case( +# {((grid_table.c.x == x) & (grid_table.c.y == y)): value +# for (x, y), value in update_dict.items()}, +# else_=grid_table.c.value +# ) +# print("Case Statement:", str(case_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + + +# # Create the update statement +# update_stmt = ( +# update(grid_table). +# where(grid_table.c.value != case_stmt). +# values(value=case_stmt) +# ) + +# print("Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + + +# # Print the SQL for each update +# for (x, y), value in update_dict.items(): +# update_stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=value) +# ) +# # Print the SQL statement with literal values for debugging +# print("Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + +# # Execute the update statement +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") + +# # Execute the update +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries.") + +# engine.dispose() + +# engine = create_engine(f"sqlite:///{db_file}") +# metadata = MetaData() +# grid_table = Table('grid', metadata, autoload_with=engine) +# # Verify the updated rows +# select_stmt = select(grid_table) + +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# for row in rows: +# print(row) + +# # Define your SQLite engine and metadata +# engine = create_engine(F'sqlite:///{db_file}') +# metadata = MetaData() + +# # Reflect the grid table +# grid_table = Table('grid', metadata, autoload_with=engine) + +# # Define your update dictionary +# update_dict = {(1, 1): 1.0, (2, 2): 1.0, (3, 3): 1.0, (4, 4): 1.0, (5, 5): 1.0} + +# # Execute updates +# # with engine.connect() as connection: +# connection = engine.connect() +# # for (x, y), value in update_dict.items(): +# (x,y) = (1, 1) +# value = update_dict[(1,1)] + +# update_stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=value) +# ) +# # Print the SQL statement for debugging +# print("Executing Update Statement:", str(update_stmt.compile(engine, +# compile_kwargs={"literal_binds": True}))) + +# # Execute the update statement +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries for coordinates ({x}, {y}).") +# connection.close() + +# select_stmt = select(grid_table.c.x) + +# # Execute the SELECT statement +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# x_values = result.fetchall() + +# type(x_values[0]) + +# select_stmt = select(grid_table.c.y) + +# # Execute the SELECT statement +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# y_values = result.fetchall() + +# select_stmt = select(grid_table.c.value) + +# # Execute the SELECT statement +# 
with engine.connect() as connection: +# result = connection.execute(select_stmt) +# values = result.fetchall() + +# case_stmt = case( +# *[(grid_table.c.x == x) & (grid_table.c.y == y, value) +# for (x, y), value in update_dict.items()], +# else_=grid_table.c.value +# ) + +# update_dict = {(1, 2): 1.0, (3, 2): 1.0, (1, 5): 1.0, (4, 5): 1.0, (3, 5): 4.0} + +# with engine.connect() as connection: +# # Select all values to check the current state +# result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) +# current_values = result.fetchall() +# print("Current Values:", current_values) + +# with engine.connect() as connection: +# with connection.begin(): # Begin a transaction +# for (x, y), value in update_dict.items(): +# stmt = ( +# update(grid_table) +# .where((grid_table.c.x == x) & (grid_table.c.y == y)) +# .values(value=grid_table.c.value + value) +# ) +# connection.execute(stmt) + +# with engine.connect() as connection: +# # Re-select to check the updated state +# result = connection.execute(select(grid_table.c.x, grid_table.c.y, grid_table.c.value)) +# updated_values = result.fetchall() +# print("Updated Values:", updated_values) + + +# # Confirm the updates +# with engine.connect() as connection: +# select_stmt = select([grid_table]) +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# # Print all rows to verify updates +# print("Database contents after update:") +# for row in rows: +# print(row) + + +# # Construct the update statement +# update_stmt = ( +# update(grid_table) +# .values(value=case_stmt) +# .where(grid_table.c.value != case_stmt) +# ) + +# # Create a SELECT statement to fetch all rows from the grid_table +# select_stmt = select(grid_table) + +# # Execute the SELECT statement and fetch results +# with engine.connect() as connection: +# result = connection.execute(select_stmt) +# rows = result.fetchall() + +# # Print or inspect the fetched rows +# for row in rows: +# print(row) + +# # Create the update statement +# update_stmt = ( +# update(grid_table) +# .where(grid_table.c.value != case_stmt) +# .values(value=case_stmt) +# ) + +# # Execute the update +# with engine.connect() as connection: +# result = connection.execute(update_stmt) +# print(f"Updated {result.rowcount} entries.") + +# case( +# [ +# ((grid_table.c.x == x) & (grid_table.c.y == y), value) +# for (x, y), value in update_dict.items() +# ], +# else_=grid_table.c.value +# ) + +# # Create a case statement for conditional update +# case_statements = { +# (x, y): case( +# [(grid_table.c.x == x) & (grid_table.c.y == y, value)], +# else_=grid_table.c.value +# ) +# for (x, y), value in update_dict.items() +# } + + +# # Define SQL command to select all data from the grid table +# select_sql = "SELECT * FROM grid;" + +# # Connect to the database and execute the query +# with engine.connect() as connection: +# try: +# # Execute the select command +# result = connection.execute(text(select_sql)) +# # Fetch all rows from the result +# rows = result.fetchall() +# # Print the results +# print("Data in grid table:") +# for row in rows: +# print(row) +# except Exception as e: +# print("An error occurred: {}".format(e)) + +# # Coordinates to update +# update_coords = {(1,1), (2,2), (3,3), (4,4), (5,5)} + +# # Create a copy of grid_points and update specific coordinates +# updated_grid_points = [ +# (i, j, 1.0) if (i, j) in update_coords else (i, j, value) +# for i, j, value in grid_points +# ] + +# # Retrieve current data from the database +# with engine.connect() as 
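+# NOTE: One way to make the CASE-based bulk update attempted above work, using the tuple form
+# of sqlalchemy.case() (SQLAlchemy 1.4+/2.0 style); the grid table and update_dict values are
+# assumptions for illustration.
+from sqlalchemy import Column, Float, Integer, MetaData, Table, case, create_engine, select, update
+
+engine = create_engine("sqlite:///:memory:")
+metadata = MetaData()
+grid = Table(
+    "grid", metadata,
+    Column("x", Integer, primary_key=True),
+    Column("y", Integer, primary_key=True),
+    Column("value", Float),
+)
+metadata.create_all(engine)
+
+with engine.begin() as conn:
+    conn.execute(grid.insert(), [{"x": i, "y": i, "value": 0.0} for i in range(1, 4)])
+
+update_dict = {(1, 1): 1.0, (2, 2): 2.5}
+
+# WHEN (x = 1 AND y = 1) THEN 1.0 WHEN (x = 2 AND y = 2) THEN 2.5 ELSE value END
+case_stmt = case(
+    *[((grid.c.x == x) & (grid.c.y == y), val) for (x, y), val in update_dict.items()],
+    else_=grid.c.value,
+)
+
+with engine.begin() as conn:
+    conn.execute(update(grid).values(value=case_stmt))
+    rows = conn.execute(select(grid)).fetchall()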
connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# current_data = result.fetchall() + +# # Convert to a dictionary for easy comparison +# current_values = {(x, y): value for x, y, value in current_data} + +# # Convert updated_grid_points to a dictionary +# updated_values = {(i, j): value for i, j, value in updated_grid_points} + +# # Find differences +# differences = [ +# (i, j, value) +# for i, j, value in updated_grid_points +# if (i, j) in updated_values and (i, j) not in current_values or +# (i, j) in current_values and current_values[(i, j)] != value +# ] + +# # Update differing values in the database +# with engine.connect() as connection: +# for i, j, value in differences: +# connection.execute( +# text(f"UPDATE grid SET value = {value} WHERE x = {i} AND y = {j}"), +# ) +# print(f"Updated {len(differences)} entries.") + +# # Step 8: Read the table into Python +# with engine.connect() as connection: +# # Query to select all rows from the table +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# df = pd.DataFrame(result.fetchall(), columns=['x', 'y', 'value']) + +# # Print the DataFrame to validate the changes +# print(df) + +# # Check current values +# with engine.connect() as connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# current_values = {(row[0], row[1]): row[2] for row in result.fetchall()} + +# print("Current grid points in database:") +# for row in current_values.items(): +# print(row) + +# print("Updated grid points with changes:") +# for row in updated_grid_points: +# print(row) + +# # Determine differences +# differences = [ +# (i, j, value) +# for i, j, value in updated_grid_points +# if (i, j) in current_values and current_values[(i, j)] != value +# ] + +# print(f"Differences to update: {differences}") + +# # Step 6: Update the database with INSERT OR REPLACE +# with engine.connect() as connection: +# with connection.begin(): # Ensure transactions are committed +# for i, j, value in updated_grid_points: +# sql = """ +# INSERT OR REPLACE INTO grid (x, y, value) +# VALUES (:x, :y, :value) +# """ +# print(f"Executing SQL: {sql} with values: x={i}, y={j}, value={value}") +# connection.execute( +# text(sql), +# {"x": i, "y": j, "value": value} +# ) +# print(f"Updated entries with INSERT OR REPLACE.") + +# # Step 8: Read the table into Python +# with engine.connect() as connection: +# result = connection.execute(text("SELECT x, y, value FROM grid;")) +# rows = result.fetchall() +# df = pd.DataFrame(rows, columns=['x', 'y', 'value']) + +# # Print the DataFrame to validate the changes +# print("Updated table data:") +# print(df) + + +# engine.dispose() + +# # Check if the file exists and then remove it +# if db_file.exists(): +# db_file.unlink() +# print(f"Deleted the file: {db_file}") +# else: +# print(f"The file does not exist: {db_file}") + +# with engine.connect() as connection: +# connection.execute(text(""" +# CREATE TABLE IF NOT EXISTS grid ( +# x INTEGER, +# y INTEGER, +# value REAL, +# PRIMARY KEY (x, y) +# ); +# """)) + +# connection.execute(text(""" +# INSERT OR REPLACE INTO grid (x, y, value) VALUES +# (1, 1, 0), (1, 2, 0), (1, 3, 0), (1, 4, 0), (1, 5, 0), +# (2, 1, 0), (2, 2, 0), (2, 3, 0), (2, 4, 0), (2, 5, 0), +# (3, 1, 0), (3, 2, 0), (3, 3, 0), (3, 4, 0), (3, 5, 0), +# (4, 1, 0), (4, 2, 0), (4, 3, 0), (4, 4, 0), (4, 5, 0), +# (5, 1, 0), (5, 2, 0), (5, 3, 0), (5, 4, 0), (5, 5, 0); +# """)) + +# # Insert initial values (0) into the grid table +# values = ",".join(["({}, {}, 
{})".format(i, j, 0) for i, j, _ in grid_points]) +# connection.execute(text("INSERT INTO grid (x, y, value) VALUES {values};" +# .format(values=values))) + +# # Commit +# connection.commit() + +# # Verify data insertion +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# print("Data in grid table:", rows) + +# connection.execute(text(""" +# INSERT INTO grid (x, y, value) VALUES +# """ + ",".join(["({}, {}, {})".format(i, j, 0) for i, j, _ in grid_points]) + ";")) + +# engine.dispose() + + +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# print("Data in grid table:", rows) + +# with engine.connect() as connection: +# result = connection.execute(text("SELECT name FROM sqlite_master WHERE type='table';")) +# print(result.fetchall()) + +# with engine.connect() as connection: +# # Describe the table schema +# result = connection.execute(text("PRAGMA table_info(grid);")) +# columns = result.fetchall() +# print("Table schema:", columns) + +# with engine.connect() as connection: +# result = connection.execute(text("SELECT * FROM grid;")) +# rows = result.fetchall() +# for row in rows: +# print(row) + +# SQL(db_file, command="select") + + +# import geopandas as gpd +# import geopy +# import matplotlib.pyplot as plt +# import numpy as np +# import pandas as pd +# import pyproj +# import shapely.geometry +# from geopy.distance import distance +# from shapely.geometry import Point, Polygon, box +# from shapely.ops import unary_union + +# from echopop.spatial.projection import utm_string_generator, wgs84_to_utm +# from echopop.survey import Survey + +# survey = Survey( init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/ini +# tialization_config.yml" , +# survey_year_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_fil +# es/survey_year_2019_config.yml" ) + + +# grid_settings = file_configuration["geospatial"]["griddify"] +# # lat_min = grid_settings["bounds"]["latitude"][0] +# lat_min = 33.75 +# # lat_max = grid_settings["bounds"]["latitude"][1] +# lat_max = 55.50 +# # lon_min = grid_settings["bounds"]["longitude"][0] +# lon_min = -134.25 +# lon_max = grid_settings["bounds"]["longitude"][1] + +# projection = file_configuration["geospatial"]["projection"] + +# utm_code = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +# utm_num = int(utm_code) +# utm_str = f"epsg:{utm_num}" + +# biology_data = filtered_biology_output + +# from sqlalchemy import Engine, create_engine, inspect, text + +# root_dir = file_configuration["data_root_dir"] +# db_directory = Path(root_dir) / "database" +# db_directory.mkdir(parents=True, exist_ok=True) +# db_file = db_directory / "biology.db" +# # Create the engine with the full path +# engine = create_engine(f'sqlite:///{db_file}') + +# SQL_COMMANDS = { +# "create": "CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});", +# "check": "SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';", +# "drop": "DROP TABLE IF EXISTS {table_name};", +# "select": "SELECT {columns} FROM {table_name};", +# "index": "CREATE UNIQUE INDEX IF NOT EXISTS {index_name} ON {table_name} ({columns})", +# # "insert": "INSERT INTO {table_name} ({columns});", +# "insert": """ +# INSERT INTO {table_name} ({columns}) +# SELECT {columns} +# FROM (SELECT VALUES {values} FROM (VALUES {value_placeholder})) AS source ({columns}) +# {filter_clause}; +# """, +# "inspect": None, +# } + +# SQL_DTYPES = { +# 'int32': 'INTEGER', +# 'int64': 
'INTEGER', +# 'float64': 'FLOAT', +# 'bool': 'BOOLEAN', +# 'datetime64[ns]': 'DATETIME', +# 'object': 'TEXT' +# } + +# def SQL(db_file: str, command: str, **kwargs): + +# # Create engine from `db_file` string +# engine = create_engine(f"sqlite:///{db_file}") + +# # Format `columns`, if there are any and more than 1 +# if "columns" in kwargs.keys(): +# if isinstance(kwargs["columns"], list): +# kwargs["columns"] = ", ".join(kwargs["columns"]) +# else: +# kwargs["columns"] = "*" + +# # Format `columns`, if there are any and more than 1 +# # if "filter_columns" in kwargs.keys(): +# # # ---- Store the value for later +# # kwargs["filter_columns_store"] = kwargs["filter_columns"] +# # if isinstance(kwargs["filter_columns"], list): +# # kwargs["filter_columns"] = ", ".join(kwargs["filter_columns"]) + +# # Run the command +# try: +# with engine.connect() as connection: +# # ---- SELECT +# if command == "select": +# return pd.read_sql(text(SQL_COMMANDS[command].format(**kwargs)), con=connection) +# # ---- CREATE +# elif command == "create": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Check whether the table already exists or not +# table_exists = ( +# connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() +# ) +# # ---- If it doesn't, pre-allocate the table +# if table_exists is None: +# # ---- Get column definitions as a string +# column_def_dict = { +# col: SQL_DTYPES.get(str(dtype), 'TEXT') +# for col, dtype in zip(df_to_add.columns, df_to_add.dtypes) +# } +# # ---- Convert to a single string +# kwargs["column_definitions"] = ( +# ", ".join([f"{col} {dtype}" for col, dtype in column_def_dict.items()]) +# ) +# # ---- Create table +# connection.execute(text(SQL_COMMANDS["create"].format(**kwargs))) +# # ---- REPLACE +# elif command == "replace": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Replace current +# df_to_add.to_sql(name=kwargs["table_name"], +# con=connection, +# if_exists="replace", index=False) + +# # ---- INSERT +# elif command == "insert": +# # ---- Extract dataframe +# df_to_add = kwargs["dataframe"] +# # ---- Check if +# # table_exists = ( +# # connection.execute(text(SQL_COMMANDS["check"].format(**kwargs))).fetchone() +# # ) +# # tables = SQL(db_file, "inspect") +# # ---- If it doesn't, pre-allocate the table +# # if kwargs["table_name"] not in tables and "filter_columns" in kwargs.keys(): +# df_to_add.to_sql(name=kwargs["table_name"], +# con=connection, +# if_exists="append", index=False) +# # else: +# # # ---- Format `filter_columns` command if present +# # if "filter_columns" in kwargs.keys(): +# # # ---- Fetch table +# # fetch_table = ( +# # connection.execute(text( +# # ("SELECT DISTINCT {filter_columns} FROM {table_name}") +# # .format(**kwargs)) +# # ) +# # ) +# # # ---- Format the SQL data into a DataFrame +# # fetched_df = pd.DataFrame(fetch_table.fetchall(), +# columns=fetch_table.keys()) +# # # ---- Create an index tuples +# # index_tuples = ( +# # set(fetched_df[kwargs["filter_columns_store"]] +# # .itertuples(index=False, name=None)) +# # ) +# # # ---- Filter the dataframe +# # filtered_df = ( +# # df_to_add[ +# # ~df_to_add[fetched_df.columns].apply(tuple, axis=1) +# # .isin(index_tuples) +# # ] +# # ) +# # # ---- Insert the data +# # filtered_df.to_sql(name=kwargs["table_name"], +# # con=connection, +# # if_exists="append", index=False) +# # else: +# # df_to_add.to_sql(name=kwargs["table_name"], +# # con=connection, +# # if_exists="append", index=False) +# # ---- INSPECT +# elif command 
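+# NOTE: Sketch of the SQL_DTYPES idea above: map pandas dtypes to SQLite column types so a
+# table can be pre-allocated from a DataFrame's schema before appending rows. The table name
+# and columns are illustrative.
+import pandas as pd
+from sqlalchemy import create_engine, inspect, text
+
+sql_dtypes = {"int64": "INTEGER", "int32": "INTEGER", "float64": "FLOAT",
+              "bool": "BOOLEAN", "datetime64[ns]": "DATETIME", "object": "TEXT"}
+
+df = pd.DataFrame({"haul_num": [1, 2], "weight_kg": [1.5, 2.0], "sex": ["M", "F"]})
+column_defs = ", ".join(
+    f"{col} {sql_dtypes.get(str(dtype), 'TEXT')}" for col, dtype in df.dtypes.items()
+)
+
+engine = create_engine("sqlite:///:memory:")
+with engine.begin() as connection:
+    connection.execute(text(f"CREATE TABLE IF NOT EXISTS specimen_df ({column_defs});"))
+    df.to_sql("specimen_df", con=connection, if_exists="append", index=False)
+
+tables = inspect(engine).get_table_names()  # ["specimen_df"]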
== "inspect": +# return inspect(engine).get_table_names() +# else: +# connection.execute(text(SQL_COMMANDS[command].format(**kwargs))) +# finally: +# # ---- Dispose of the engine to release any resources being pooled/used +# engine.dispose() + +# _ = SQL(db_file, "drop", table_name="catch_df") +# _ = SQL(db_file, "drop", table_name="specimen_df") +# _ = SQL(db_file, "drop", table_name="length_df") +# _ = SQL(db_file, "drop", table_name="files_read") + +# _ = SQL(db_file, "insert", table_name="files_read", dataframe=current_files) +# current = SQL(db_file, "select", table_name="files_read", columns="filepath") +# current + + +# # Get acoustic directory and initialization settings +# # ---- Files +# biology_file_settings = file_configuration["input_directories"]["biological"] +# # ---- General settings +# biology_analysis_settings = file_configuration["biology"] + +# # Get the file-specific settings, datatypes, columns, etc. +# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = biology_file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} +# # ---- Initialize the SQL dictionary +# sql_biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + +# # Create full filepath +# biology_directory_path = ( +# Path(file_configuration["data_root_dir"]) / biology_file_settings["directory"] +# ) +# # ---- Directory check +# directory_existence = biology_directory_path.exists() +# # ---- Error evaluation (if applicable) +# if not directory_existence: +# raise FileNotFoundError( +# f"The acoustic data directory [{biology_directory_path}] does not exist." +# ) +# # ---- Get the defined file extension +# file_extension = biology_file_settings["extension"] +# # ---- Create Path.glob generator object +# file_path_obj = biology_directory_path.glob(f"*{'.'+file_extension}") +# #---- Create list of `*.csv`` files +# csv_files = list(file_path_obj) +# # ---- Ensure files exist or raise error otherwise +# if len(csv_files) < 1: +# raise FileNotFoundError( +# f"No `*.csv` files found in [{biology_directory_path}]!" 
+# ) +# else: +# # ---- Create Path to SQL database file +# db_directory = Path(file_configuration["data_root_dir"]) / "database" +# # ---- Create the directory if it does not already exist +# db_directory.mkdir(parents=True, exist_ok=True) +# # ---- Complete path to `biology.db` +# db_file = db_directory / "biology.db" +# # ---- Query the external SQL database to see if the file tracking table exists +# tables = SQL(db_file, "inspect") +# # ---- Create a list of string-formatted Path names +# csv_files_str = [str(file) for file in csv_files] +# # ---- Create DataFrame +# current_files = pd.DataFrame(csv_files_str, columns=["filepath"]) +# # ---- Create if it is missing and then advance `csv_files` +# if "files_read" not in tables: +# # ---- Insert into the SQL database file +# _ = SQL(db_file, "insert", table_name="files_read", columns="filepath", +# dataframe=current_files) +# # ---- Create empty list for later comparison +# new_files = [] +# else: +# # ---- Pull already processed filenames +# previous_files = SQL(db_file, "select", table_name="files_read") +# # ---- Compare against the current filelist +# new_files = ( +# [file for file in csv_files_str if file not in set(previous_files["filepath"])] +# ) +# # ---- Create a DataFrame for the new files +# new_files_df = pd.DataFrame(new_files, columns=["filepath"]) +# # ---- Insert into the SQL database file +# _ = SQL(db_file, "insert", table_name="files_read", dataframe=new_files_df) + +# # Iterate through each of the file ids and read in the data +# for id in list(biology_file_ids.keys()): +# # ---- Extract the specific config mapping for this tag/id +# sub_config_map = biology_config_map[id] +# # ---- Drop the `{FIELD_ID}` tag identifier +# file_id_format = re.sub(r'\{FILE_ID:([^}]+)\}', r'\1', biology_file_ids[id]) +# # ---- Replace all other tags with `*` placeholders +# file_id_format = re.sub(r"\{[^{}]+\}", "*", file_id_format) +# # ---- Create Path object with the generalized format +# subfile_path_obj = biology_directory_path.glob(f"{file_id_format}.{file_extension}") +# # ---- List all files that match this pattern +# subcsv_files_str = [str(file) for file in list(subfile_path_obj)] +# # ---- Filter for only new files +# subset_files = set(subcsv_files_str).intersection(set(new_files)) +# # ---- Pull from SQL database, if applicable +# if f"{id}_df" in tables: +# # ---- SELECT +# sql_df = SQL(db_file, "select", table_name=f"{id}_df", columns="*") +# # ---- Concatenate to the dictionary +# sql_biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sql_df]) +# # ---- Add data files not stored in SQL database +# if len(subset_files) > 0 or len(subset_files)== 0 and f"{id}_df" not in tables: +# if len(subset_files) > 0: +# file_list = subset_files +# else: +# file_list = subcsv_files_str +# # ---- Create a list of relevant dataframes +# sub_df_lst = [read_biology_csv(Path(file), biology_file_ids[id], sub_config_map) +# for file in file_list] +# # ---- Concatenate into a single DataFrame +# sub_df = pd.concat(sub_df_lst, ignore_index=True) +# # ---- Concatenate to the dictionary DataFrame +# biology_output[f"{id}_df"] = pd.concat([biology_output[f"{id}_df"], sub_df]) + +# # Get contrasts used for filtering the dataset +# # ---- Species +# species_filter = file_configuration["species"]["number_code"] +# # ---- Trawl partition information +# trawl_filter = biology_analysis_settings["catch"]["partition"] +# # ---- Apply the filter +# filtered_biology_output = { +# key: df[ +# (df['species_id'] == species_filter if 
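+# NOTE: Compact sketch of the file-tracking step above: record processed file paths in a
+# "files_read" table and only ingest paths not already present. The database is in-memory and
+# the paths are made up.
+import pandas as pd
+from sqlalchemy import create_engine
+
+engine = create_engine("sqlite:///:memory:")
+current_files = ["biology/202407_017_catch_perc.csv", "biology/202407_018_catch_perc.csv"]
+
+# Seed the tracking table with one "already processed" file
+pd.DataFrame({"filepath": current_files[:1]}).to_sql("files_read", con=engine, index=False)
+
+# Compare against what has been logged, then append only the new paths
+previous = pd.read_sql("SELECT filepath FROM files_read", con=engine)
+new_files = [file for file in current_files if file not in set(previous["filepath"])]
+pd.DataFrame({"filepath": new_files}).to_sql("files_read", con=engine,
+                                             if_exists="append", index=False)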
'species_id' in df.columns else True) & +# (df['trawl_partition'].str.lower() == trawl_filter if 'trawl_partition' in df.columns +# else True) +# ] +# for key, df in biology_output.items() if isinstance(df, pd.DataFrame) and not df.empty +# } + +# # Update the SQL database +# for table_name, df in filtered_biology_output.items(): +# # ---- Update +# _ = SQL(db_file, "insert", table_name=table_name, columns="*", +# dataframe=df) + +# # Combine the two datasets +# merged_output = { +# key: pd.concat([ +# sql_biology_output.get(key, pd.DataFrame()), +# filtered_biology_output.get(key, pd.DataFrame()) +# ]).drop_duplicates().reset_index(drop=True) +# for key in set(sql_biology_output) | set(filtered_biology_output) +# } +# # ---- Return output +# merged_output + +# coordinate_metadata.attrs[] + +# SQL(biology_db, command="drop", table_name="catch_df") +# SQL(biology_db, command="drop", table_name="specimen_df") +# SQL(biology_db, command="drop", table_name="length_df") +# SQL(biology_db, command="drop", table_name="files_read") +# _ = SQL(db_file=db_file, command="create", table_name="files_read", columns="filepath") +# tables = SQL(db_file, "inspect") +# tables +# current = SQL(db_file, "select", table_name="files_read", columns=["filepath"]) +# current + +# SQL(db_file, "select", table_name="catch_df", columns="*") +# new_files_df = pd.DataFrame(csv_files_str, columns=['file_path']) +# _ = SQL("insert", engine, table_name="files_read",dataframe=new_files_df) +# current = SQL("select", engine, table_name="csv_files_read", columns="file_path") +# current +# for table_name, df in biology_data.items(): +# df.to_sql(table_name, con=engine, if_exists='append', index=False) +# command = "read" +# engine = create_engine(f'sqlite:///{db_file}') +# table_name = "files_read" +# columns = "file_path" + +# kwargs = { +# "table_name": table_name, +# "columns": columns, +# } + +# zarr_data_ds["depth"].diff(dim="depth") + +# prc_nasc_df.groupby(["longitude", "latitude"]) + +# from pandas.core.groupby import DataFrameGroupBy + + +# def estimate_echometrics(acoustic_data_df: pd.DataFrame): + +# # Create copy +# acoustic_df = acoustic_data_df.copy().reset_index(drop=True) + +# # Pre-compute the change in depth +# acoustic_df["dz"] = acoustic_df["depth"].diff() + +# # Initialize echometrics dictionary +# echometrics = {} + +# # Compute the metrics center-of-mass +# if acoustic_df["NASC"].sum() == 0.0: +# echometrics.update({ +# "n_layers": 0, +# "mean_Sv": -999, +# "max_Sv": -999, +# "nasc_db": np.nan, +# "center_of_mass": np.nan, +# "dispersion": np.nan, +# "evenness": np.nan, +# "aggregation": np.nan, +# "occupied_area": 0.0, +# }) +# else: + +# # Compute the number of layers +# echometrics.update({ +# "n_layers": acoustic_df["depth"][acoustic_df["NASC"] > 0.0].size +# }) + +# # Compute ABC +# # ---- Convert NASC to ABC +# acoustic_df["ABC"] = acoustic_df["NASC"] / (4 * np.pi * 1852 ** 2) +# # ---- Estimate mean Sv +# echometrics.update({ +# "mean_Sv": 10.0 * np.log10(acoustic_df["ABC"].sum() / acoustic_df["depth"].max()) +# }) +# # --- Estimate max Sv (i.e. 
) +# echometrics.update({ +# "max_Sv": 10 * np.log10(acoustic_df["ABC"].max() +# / acoustic_df.loc[np.argmax(acoustic_df["ABC"]), "dz"]) +# }) + +# # Compute (acoustic) abundance +# echometrics.update({ +# "nasc_db": 10 * np.log10(acoustic_df["ABC"].sum()) +# }) + +# # Compute center of mass +# echometrics.update({ +# "center_of_mass": ( +# (acoustic_df["depth"] * acoustic_df["NASC"]).sum() +# / (acoustic_df["NASC"]).sum() +# ) +# }) + +# # Compute the dispersion +# echometrics.update({ +# "dispersion": ( +# ((acoustic_df["depth"] - echometrics["center_of_mass"]) ** 2 +# * acoustic_df["NASC"]).sum() / (acoustic_df["NASC"]).sum() +# ) +# }) + +# # Compute the evenness +# echometrics.update({ +# "evenness": (acoustic_df["NASC"] **2).sum() / ((acoustic_df["NASC"]).sum()) ** 2 +# }) + +# # Compute the index of aggregation +# echometrics.update({ +# "aggregation": 1 / echometrics["evenness"] +# }) + +# # Get the occupied area +# echometrics.update({ +# "occupied_area": ( +# acoustic_df["dz"][acoustic_df["ABC"] > 0.0].sum() / acoustic_df["depth"].max() +# ) +# }) + +# # Return the dictionary +# return echometrics + +# def integrate_nasc(acoustic_data_df: pd.DataFrame, echometrics: bool = True): + +# # Vertically integrate PRC NASC +# nasc_dict = {"nasc": acoustic_data_df["NASC"].sum()} + +# # Horizontally concatenate `echometrics`, if `True` +# if echometrics: +# # ---- Compute values +# # NOTE: This uses NASC instead of linear `sv` +# echometrics_dict = estimate_echometrics(acoustic_data_df) +# # ---- Merge +# nasc_dict.update(echometrics_dict) + +# # Convert `nasc_dict` to a DataFrame and return the output +# return pd.Series(nasc_dict) + +# def process_group(group): +# result = integrate_nasc(group, echometrics=True) +# result = result.reset_index(drop=True) +# # Concatenate the result back to the original group for alignment +# group = group.reset_index(drop=True) +# combined = pd.concat([group, result], axis=1) +# return combined + +# acoustic_data_df = acoustic_data["prc_nasc_df"] + + +# rc_nasc_df[prc_nasc_df["distance"] == 0.0] +# acoustic_data_df = mek[mek["distance"] == 0.0] +# pd.DataFrame(nasc_dict, index=[0]).reset_index(drop=True).unstack() +# nasc_data_df = ( +# prc_nasc_df.groupby(["longitude", "latitude", "ping_time"]) +# .apply(lambda group: integrate_nasc(group, echometrics=False), include_groups=False) +# .reset_index() +# ) + + +# kwargs = { +# "table_name": "csv_files_read", +# "columns": "file_path", +# "dataframe": new_files_df +# } + +# current_process = psutil.Process() +# import logging + +# # Create a session +# Session = sessionmaker(bind=engine) +# session = Session() + +# # Perform database operations +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) +# logger.info("Performing database operations") + +# # Create a session +# Session = sessionmaker(bind=engine) +# session = Session() + +# # Perform database operations +# logger.info("Performing database operations") + +# # Close the session +# session.close() +# logger.info("Session closed") + +# # Dispose the engine +# engine.dispose() +# logger.info("Engine disposed") + +# # Force garbage collection +# import gc + +# gc.collect() +# logger.info("Garbage collection performed") + +# import psutil + +# pid = psutil.Process().pid +# process = psutil.Process(pid) +# open_files = process.open_files() +# db_path = r'C:\Users\Brandyn\Documents\GitHub\EchoPro_data\live_2019_files\database\biology.db' + +# # Check if the file is still in use +# for file in open_files: +# if db_path in 
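+# NOTE: Minimal numeric sketch of the echometrics defined above (center of mass, dispersion,
+# evenness, and index of aggregation) for a single NASC-by-depth profile; the profile values
+# are made up.
+import pandas as pd
+
+profile = pd.DataFrame({"depth": [10.0, 20.0, 30.0, 40.0],
+                        "NASC": [0.0, 150.0, 300.0, 50.0]})
+
+nasc_sum = profile["NASC"].sum()
+center_of_mass = (profile["depth"] * profile["NASC"]).sum() / nasc_sum
+dispersion = ((profile["depth"] - center_of_mass) ** 2 * profile["NASC"]).sum() / nasc_sum
+evenness = (profile["NASC"] ** 2).sum() / nasc_sum ** 2
+aggregation = 1.0 / evenness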
file.path: +# logger.info(f"File {db_path} is still in use.") +# else: +# logger.info(f"File {db_path} is not in use.") + +# # Define the SQL to drop the table +# drop_table_sql = "DROP TABLE IF EXISTS csv_files_read;" +# # Execute the drop table SQL +# with engine.connect() as connection: +# _ = connection.execute(text(drop_table_sql)) + +# import sqlite3 + +# if os.path.exists(db_path): +# conn = sqlite3.connect(db_path) +# conn.close() +# # Force the file to be removed +# try: +# os.remove(db_path) +# print(f"Database file {db_path} has been deleted.") +# except PermissionError: +# print(f"Failed to delete {db_path}. The file is still in use.") + +# create_table_sql = """ +# CREATE TABLE IF NOT EXISTS csv_files_read ( +# file_path TEXT UNIQUE +# ); +# """ +# # Execute the create table SQL +# with engine.connect() as connection: +# _ = connection.execute(text(create_table_sql)) + +# root_directory = Path(root_dir) +# dataset = "biology" + +# # Convert to strings +# csv_files_str = [str(file) for file in csv_files] + +# existing_files_df = pd.read_sql('SELECT file_path FROM csv_files_read', con=engine) +# existing_files_set = set(existing_files_df['file_path']) +# # Filter out duplicates from the csv_files list +# new_files = [file for file in csv_files_str if file not in existing_files_set] +# # Insert only new file paths into the SQL table +# if new_files: +# new_files_df = pd.DataFrame(new_files, columns=['file_path']) +# _ = new_files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) + + +# with engine.connect() as conn: +# conn.execute(""" +# CREATE TABLE IF NOT EXISTS csv_files_read ( +# file_path TEXT UNIQUE +# ) +# """) + +# csv_files +# files_df.to_sql('csv_files_read', con=engine, if_exists='append', index=False) +# file_name_format = biology_file_ids[id] +# def compile_filename_format(file_name_format: str): + +# # Create a copy of `file_name_format` +# regex_pattern = file_name_format + +# # Iterate through the keys from `LIVE_FILE_FORMAT_MAP` to format a regex pattern +# for key, value in LIVE_FILE_FORMAT_MAP.items(): +# regex_pattern = regex_pattern.replace(f"{{{key}}}", value["expression"]) +# # ---- Replace the `FILE_ID` tag +# regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) + +# # Compile the regex pattern and return the output +# return re.compile(regex_pattern) + +# from sqlalchemy.orm import sessionmaker + +# Session = sessionmaker(bind=engine) +# session = Session() +# session.close() +# engine.pool.status() +# # Dispose the engine to close all connections +# engine.dispose() +# import gc + +# gc.collect() +# import psutil + +# dbapi_conn = engine.raw_connection() +# dbapi_conn.close() +# # Get the process ID of the current process +# pid = psutil.Process().pid + +# # List all open files for the current process +# process = psutil.Process(pid) +# open_files = process.open_files() + +# for file in open_files: +# print(file.path) + + +# pattern = filename_format +# config_settings = sub_config_map +# regex_pattern = pattern + +# # Replace patterns based on LIVE_FILE_FORMAT_MAP +# for key, value in LIVE_FILE_FORMAT_MAP.items(): +# regex_pattern = regex_pattern.replace(f'{{{key}}}', value['expression']) +# regex_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'(?P\1)', regex_pattern) +# new_pattern = compile_filename_format(regex_pattern) +# match_obj = new_pattern.search(file.name) +# # Get substring components as a list +# filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) +# valid_tags = list(set(["HAUL", 
"SPECIES_CODE"]).intersection(set(filename_substrings))) + +# for i in valid_tags: +# matched_key = LIVE_FILE_FORMAT_MAP[i] +# df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) + + +# # Assign the data as new columns to the DataFrame +# for key, value in data_to_add.items(): +# df[key] = value + +# for i in valid_tags: +# matched_key = LIVE_FILE_FORMAT_MAP[i] +# df[matched_key["name"]] = matched_key["dtype"](match_obj.group(i)) +# biology_analysis_settings +# species_id_value = 22500 +# trawl_partition_value = 'Codend' # Adjust as needed +# { +# key: df[ +# (('species_id' not in df.columns) or (df['species_id'] == species_id_value)) & +# (('trawl_partition' not in df.columns) or (df['trawl_partition'] == +# trawl_partition_value)) +# ] +# for key, df in biology_output.items() if isinstance(df, pd.DataFrame) +# } + +# (match_obj.group(i)).astype(matched_key["dtype"]) +# pattern = '{DATE:YYYYMM}_{HAUL}_{FILE_ID:catch_perc}' +# modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) +# # Create the regex pattern +# regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +# re.compile(regex_pattern) + +# modified_pattern = re.sub(r'\{FILE_ID:(.+?)\}', r'\1', pattern) + +# # Create the regex pattern +# regex_pattern = modified_pattern.replace('{', '(?P<').replace('}', '>.+?)') +# compile_filename_format(regex_pattern) +# # Regular expression to capture values inside the curly braces +# regex = r'\{([^:}]+):([^}]+)\}' + +# # Find all matches +# matches = re.findall(regex, modified_pattern) + +# # Get substring components as a list +# filename_substrings = re.findall(r'\{([^:}]+)(?::[^}]+)?}', pattern) + +# pattern_changed = pattern.replace("FILE_ID:", "") + +# # Compilte the filename regular expression format +# compiled_regex = compile_filename_format(pattern_changed) + +# file_id_tag = pattern.split('{FILE_ID:')[1].split('}')[0] + +# # Get the file name and produce a `re.Match` object +# match_obj = compiled_regex.search(file.name) + + +# def read_biology_csv(file: Path, pattern: re.Pattern, config_settings: dict): + +# # Get the file name and produce a `re.Match` object +# match_obj = pattern.search(file.name) + +# # Read in the `*.csv` file +# df = pd.read_csv(file, usecols=list(config_settings["dtypes"].keys())) + +# # Validate the dataframe +# # ---- Check for any missing columns +# missing_columns = ( +# [key for key in config_settings["dtypes"].keys() if key not in df.columns] +# ) +# # ---- Raise Error, if needed +# if missing_columns: +# raise ValueError( +# f"The following columns are missing from [{file}]: {', '.join(missing_columns)}!" 
+# ) +# # ---- Ensure the correct datatypes +# df_validated = df.astype(config_settings["dtypes"]) + +# # Replace column names and drop +# df_validated = df_validated.rename(columns=config_settings["names"]) + +# # Get the haul number and add the the dataframe +# # ---- Extract the haul number and convert to an integer +# haul_num = int(match_obj.group("HAUL")) +# # ---- Add the column +# df_validated["haul_num"] = haul_num + +# # Return the resulting DataFrame +# return df_validated + +# boundary_dict = griddify_definitions["bounds"] + +# import geopandas as gpd +# import numpy as np +# import pandas as pd +# from geopy.distance import distance + +# from echopop.spatial.projection import utm_string_generator + +# ## +# grid_settings["grid_resolution"]["x"] = 50 +# grid_settings["grid_resolution"]["y"] = 50 +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters + +# # CREATE BOUNDING +# bound_df = pd.DataFrame({ +# "lon": np.array([lon_min, lon_max, lon_max, lon_min, lon_min]), +# "lat": np.array([lat_min, lat_min, lat_max, lat_max, lat_min]) +# }) + +# bound_gdf = gpd.GeoDataFrame( +# data=bound_df, +# geometry=gpd.points_from_xy(bound_df["lon"], bound_df["lat"]), +# crs = projection +# ) +# import shapely.geometry + +# from echopop.spatial.projection import utm_string_generator + +# utm_string_generator(-117.0, 33.75) +# bound_gdf.total_bounds +# # Convert to UTM +# bound_utm = bound_gdf.to_crs(utm_num) +# bound_utm.total_bounds +# y_step = lat_step +# x_step = lon_step +# # bound_utm = bound_gdf +# # y_step = grid_settings["grid_resolution"]["y"] * 1852 / 110574 +# # x_step = grid_settings["grid_resolution"]["x"] * 1852 / 60.0 + +# xmin, ymin, xmax, ymax = bound_utm.total_bounds + +# # Get number of cells +# n_x_cells = int(np.ceil((xmax - xmin) / x_step)) +# n_y_cells = int(np.ceil((ymax - ymin) / y_step)) + +# import pyproj + +# # create the cells in a loop +# # grid_cells = [] +# # for x0 in np.arange(xmin, xmax, x_step): +# # for y0 in np.arange(ymin, ymax, y_step): +# # # bounds +# # utm_zone = utm_string_generator(x0, y0) +# # proj = pyproj.Proj(f"epsg:{utm_code}") +# # x1 = x0-x_step +# # y1 = y0+y_step +# # grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# grid_cells = [] +# for y0 in np.arange(ymin, ymax, y_step): + +# # x_step = grid_settings["grid_resolution"]["x"] * 1852 / (1852 * 60 * np.cos(np.radians(y0))) + +# for x0 in np.arange(xmin, xmax, x_step): +# # bounds +# # utm_zone = utm_string_generator(x0, y0) +# # proj = pyproj.Proj(f"epsg:{utm_code}") +# # x1, y1 = proj(x0, y0) +# # x2, y2 = proj(x0 - x_step, y0 + y_step) +# # grid_cells.append(box(x1, y1, x2, y2)) +# x1 = x0-x_step +# y1 = y0+y_step +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +# cells_gdf.shape +# n_x_cells * n_y_cells +# # cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"]) +# cells_gdf.total_bounds +# cells_gdf.to_crs(projection).total_bounds +# from shapely.geometry import mapping +# from shapely.validation import make_valid + +# ######## +# world = gpd.read_file("C:/Users/Brandyn/Documents/GitHub/EchoPro_data/live_2019_files/coastline/ +# ne_10m_land/ne_10m_land.shp") +# bb_orig = box(lon_min, lat_min, lon_max, lat_max) +# boundary_box = box(lon_min - 5, lat_min - 5, lon_max + 5, lat_max + 5) +# world_orig = gpd.clip(world, box(lon_min-1, lat_min-1, lon_max+1, lat_max+1)) +# 
world_clipped_latlon = gpd.clip(world, boundary_box) +# world_clipped = gpd.clip(world, boundary_box).to_crs(utm_code) + +# world_utm = world.to_crs(utm_code) +# world_utm = world_utm[~world_utm.is_empty] + +# bbox_latlon = box(lon_min, lat_min, lon_max, lat_max) + +# gpd.GeoDataFrame(geometry=[bbox_latlon], crs=projection).to_crs(utm_code) + +# bbox_utm = bound_utm.total_bounds + +# buffer = [-lon_step * 1.01, -lat_step * 1.01, lon_step * 1.01, lat_step * 1.01] +# array_buffer = bbox_utm + buffer +# array_names = ["minx", "miny", "maxx", "maxy"] +# buffered = dict(zip(array_names, array_buffer)) +# buffer_boundary = box(**buffered) +# # box(array_buffer[0], array_buffer[1], array_buffer[2], array_buffer[3]) +# # buffer_boundary = buffer_boundary.to_crs(world_utm.crs) + +# buffer_boundary_gdf = gpd.GeoDataFrame(geometry=[buffer_boundary], crs=world_utm.crs) +# # Replace with the correct EPSG code +# bb_orig_gdf = gpd.GeoDataFrame(geometry=[bb_orig], crs=projection) +# # sub_clipped = gpd.clip(world_utm, buffer_boundary) +# # sub_clipped = gpd.clip(world_utm, bbox_utm) + +# from datetime import datetime + +# import geopandas as gpd +# import matplotlib.cm as cm +# import matplotlib.colors as colors +# import matplotlib.dates as mdates +# import matplotlib.pyplot as plt +# import numpy as np +# from matplotlib.colors import ListedColormap +# from shapely import wkt + +# # fig, ax = plt.subplots(figsize=(10, 10)) +# # # Plot the buffer_boundary +# # world.plot(ax=ax, linewidth=2, color='gray') +# # buffer_boundary_gdf.to_crs(projection).plot(ax=ax, facecolor='none', edgecolor='blue') +# # bb_orig_gdf.plot(ax=ax, facecolor='none', edgecolor='red') +# # plt.xlim(lon_min-3, lon_max+3) +# # plt.ylim(lat_min-3, lat_max+3) +# # plt.show() +# from echopop.live.sql_methods import SQL + +# db_filepath = realtime_survey.config["database"]["grid"] +# survey_db = realtime_survey.config["database"]["acoustics"] +# grid_df = SQL(db_filepath, "select", table_name="grid_df") +# # grid_df[grid_df.abundance > 0] +# grid_df[grid_df.abundance > 1e10] +# # grid_df[grid_df.abundance > 0] +# coast_df = SQL(db_filepath, "select", table_name="coastline_df") +# survey_df = SQL(survey_db, "select", table_name="survey_data_df") + +# # def parse_datetime(date_str): +# # # List of possible formats +# # formats = [ +# # '%Y-%m-%d %H:%M:%S.%f', # With fractional seconds +# # '%Y-%m-%d %H:%M:%S', # Without fractional seconds +# # '%Y-%m-%dT%H:%M:%S.%f', # ISO 8601 format with fractional seconds +# # '%Y-%m-%dT%H:%M:%S' # ISO 8601 format without fractional seconds +# # ] + +# # for fmt in formats: +# # try: +# # return pd.to_datetime(date_str, format=fmt) +# # except (ValueError, TypeError): +# # continue # Try the next format + +# # return pd.NaT # Return NaT if no formats match + +# # survey_df["ping_time"] = survey_df["ping_time"].apply(parse_datetime) + +# # pd.to_datetime(survey_df["ping_time"], format='%Y-%m-%d %H:%M:%S.%f', errors="coerce") + +# # fig, ax = plt.subplots(figsize=(5, 8)) +# # ax.scatter(survey_df.ping_time, survey_df.nasc) +# # plt.ylabel("NASC") +# # # ax.xaxis.set_major_locator(mdates.DayLocator(5, 10, 15)) +# # plt.show() + + +# # times = np.arange(np.datetime64('2001-01-02'), +# # np.datetime64('2002-02-03'), np.timedelta64(75, 'm')) +# # y = np.random.randn(len(times)) +# # survey_df[(survey_df.nasc > 0) & (survey_df.nasc < 1e5)]["nasc"].mean() +# # survey_df[(survey_df.nasc > 0) & (survey_df.nasc > 1e5)]["nasc"].mean() + +# # fig, ax = plt.subplots() +# # ax.plot(times, y) +# # 
survey_df[(survey_df.number_density > 0) & (survey_df.x == 21)] +# # # a = self.input["acoustics"]["prc_nasc_df"] +# # # survey_df[(survey_df.x) == 24 & (survey_df.y == 13)] + +# grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads) +# coast_df["geometry"] = coast_df["geometry"].apply(wkt.loads) + +# projection = realtime_survey.config["geospatial"]["projection"] + +# grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry", crs=projection) +# grid_gdf_1 = grid_gdf[grid_gdf.abundance > 0] +# coast_gdf = gpd.GeoDataFrame(coast_df, geometry="geometry", crs=projection) + +# lims = grid_gdf.total_bounds +# # nu = dataset_gdf[(dataset_gdf.stratum_x == 25) & (dataset_gdf.stratum_y == 11)] +# # dataset_gdf.stratum_x.max() +# # # np.linspace(1, 1, len(np.arange(xmin, xmax+x_step, x_step))-1) + +# # # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) +# # pd.cut( +# # nu["x"], +# # np.arange(xmin, xmax, x_step), +# # right = False, +# # labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), +# # ).astype(int) - 1 +# # grid_gdf["x"] = grid_gdf["x"] - 1 + +# # fig, ax = plt.subplots(figsize=(5, 8)) +# # grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) +# # plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') +# # plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") +# # # Calculate centroids and plot text +# # for idx, row in grid_gdf.iterrows(): +# # centroid = row.geometry.centroid +# # var = f"{row.x}-{row.y}" +# # ax.annotate(var, xy=(centroid.x, centroid.y), +# # xytext=(0,0), fontsize=8, +# # textcoords="offset points", +# # ha='center', va='center', color='black') +# # plt.tight_layout() +# # plt.margins(0, 0) +# # coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # plt.xlim(lims[0]*1.005, lims[2]*1.01) +# # plt.ylim(lims[1]*0.98, lims[3]*1.005) +# # plt.show() + + +# variable = "abundance" +# VARIABLE_MAP = { +# "number_density_mean": { +# "name": "Mean number density", +# "units": "fish $\\mathregular{nmi^{-2}}$" +# }, +# "biomass_density_mean": { +# "name": "Mean biomass density", +# "units": "kg $\\mathregular{nmi^{-2}}$" +# }, +# "biomass": { +# "name": "Biomass", +# "units": "kg" +# }, +# "abundance": { +# "name": "Abundance", +# "units": "$\\it{N}$" +# } +# } + +# viridis = plt.colormaps.get_cmap('viridis').resampled(1024) +# newcolors = viridis(np.linspace(0, 1, 1024))[::-1] +# white = np.array([1, 1, 1, 1]) +# newcolors[0, :] = white +# custom_cmap = ListedColormap(newcolors) +# # Check the minimum and maximum values for normalization -# # np.arange(1, len(np.arange(xmin, xmax+x_step, x_step))) -# pd.cut( -# nu["x"], -# np.arange(xmin, xmax, x_step), -# right = False, -# labels = np.arange(1, len(np.arange(xmin, xmax, x_step))), -# ).astype(int) - 1 -# grid_gdf["x"] = grid_gdf["x"] - 1 # fig, ax = plt.subplots(figsize=(5, 8)) # grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) -# plt.plot(dataset_gdf.longitude, dataset_gdf.latitude, linewidth=1, color='black') -# plt.plot(nu.longitude, nu.latitude, linewidth=1, color="red") -# # Calculate centroids and plot text -# for idx, row in grid_gdf.iterrows(): -# centroid = row.geometry.centroid -# var = f"{row.x}-{row.y}" -# ax.annotate(var, xy=(centroid.x, centroid.y), -# xytext=(0,0), fontsize=8, -# textcoords="offset points", -# ha='center', va='center', color='black') +# grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, +# legend=False, norm=norm) 
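# NOTE: In the commented plotting block here, `norm` is passed to `grid_gdf_1.plot(...)`
# just above but is only defined a few lines below; if this block is revived, the
# normalization (and colormap) should be built first. A minimal, self-contained sketch
# using synthetic values only (the array below is a stand-in, not the real `grid_gdf` column):
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from matplotlib.colors import ListedColormap

values = np.array([0.0, 10.0, 250.0, 1000.0])  # e.g. gridded abundance per cell
# Reverse viridis and force the lowest entry to white so zero-valued cells plot blank
base = plt.colormaps.get_cmap("viridis").resampled(1024)
cmap_colors = base(np.linspace(0, 1, 1024))[::-1]
cmap_colors[0, :] = np.array([1.0, 1.0, 1.0, 1.0])
custom_cmap = ListedColormap(cmap_colors)
# Define the normalization before any plot call that references it
norm = colors.Normalize(vmin=0.0, vmax=values.max(), clip=False)
fig, ax = plt.subplots()
ax.scatter(np.arange(values.size), values, c=values, cmap=custom_cmap, norm=norm)
fig.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax,
             orientation="horizontal", shrink=0.5)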
+# plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") +# vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() +# vmax = grid_gdf[variable].max() +# norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) +# # norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), +# vmax=grid_gdf[variable].max()) +# # cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, +# orientation="horizontal", shrink=0.5) +# cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, +# orientation="horizontal", shrink=0.5) +# cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", +# fontsize=12, labelpad=10, loc='center') +# cbar.ax.xaxis.set_label_position('top') +# cbar.ax.xaxis.set_ticks_position('top') # plt.tight_layout() -# plt.margins(0, 0) +# plt.margins(0,0) +# # grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") # coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") # plt.xlim(lims[0]*1.005, lims[2]*1.01) # plt.ylim(lims[1]*0.98, lims[3]*1.005) +# plt.xlabel(u'Longitude (\u00B0E)') +# plt.ylabel(u'Latitude (\u00B0N)') # plt.show() -variable = "abundance" -VARIABLE_MAP = { - "number_density_mean": { - "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$" - }, - "biomass_density_mean": { - "name": "Mean biomass density", - "units": "kg $\\mathregular{nmi^{-2}}$" - }, - "biomass": { - "name": "Biomass", - "units": "kg" - }, - "abundance": { - "name": "Abundance", - "units": "$\\it{N}$" - } -} - -viridis = plt.colormaps.get_cmap('viridis').resampled(1024) -newcolors = viridis(np.linspace(0, 1, 1024))[::-1] -white = np.array([1, 1, 1, 1]) -newcolors[0, :] = white -custom_cmap = ListedColormap(newcolors) -# Check the minimum and maximum values for normalization - - -fig, ax = plt.subplots(figsize=(5, 8)) -grid_gdf.plot(ax=ax, edgecolor="gainsboro", color="white", linewidth=0.5, legend=False) -grid_gdf_1.plot(ax=ax, column=variable, edgecolor="black", linewidth=2, cmap=custom_cmap, legend=False, norm=norm) -plt.scatter(survey_df["longitude"], survey_df["latitude"], linewidth=0.5, color="black") -vmin = grid_gdf[variable][grid_gdf[variable] > 0.0].min() -vmax = grid_gdf[variable].max() -norm = colors.Normalize(vmin=0, vmax=vmax, clip=False) -# norm = colors.Normalize(vmin=grid_gdf[variable][grid_gdf[variable] > 0.0].min(), vmax=grid_gdf[variable].max()) -# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap=custom_cmap), ax=ax, orientation="horizontal", shrink=0.5) -cbar = plt.colorbar(cm.ScalarMappable(cmap=custom_cmap, norm=norm), ax=ax, orientation="horizontal", shrink=0.5) -cbar.set_label(f"{VARIABLE_MAP[variable]["name"]} ({VARIABLE_MAP[variable]["units"]})", - fontsize=12, labelpad=10, loc='center') -cbar.ax.xaxis.set_label_position('top') -cbar.ax.xaxis.set_ticks_position('top') -plt.tight_layout() -plt.margins(0,0) -# grid_gdf_1.plot(ax=ax, linewidth=1.5, color="black") -coast_gdf.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -plt.xlim(lims[0]*1.005, lims[2]*1.01) -plt.ylim(lims[1]*0.98, lims[3]*1.005) -plt.xlabel(u'Longitude (\u00B0E)') -plt.ylabel(u'Latitude (\u00B0N)') -plt.show() - - -co = SQL(db_filepath, "select", table_name="coastline_df") -co["geometry"] = co["geometry"].apply(wkt.loads) -co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) - - - -test["geometry"].apply(wkt.loads) -clipped_cells_latlon["geometry"] -len(bbox_latlon.exterior.coords) -len(buffer_boundary.exterior.coords) - -# 
world_clipped_latlon = gpd.clip(world_utm, buffer_boundary).to_crs(projection) -world_clipped_latlon -######## -cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()).to_frame("geometry") -# cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()).to_frame("geometry") -cell_colors = cells_clipped.area / (lat_step * lon_step) -# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 -cells_clipped['cell_colors'] = cell_colors -# ---> back to epsg lat/long -cells_latlon = cells_clipped.to_crs(projection) -cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) -cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) -# cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 -cells_latlon_clipped['cell_colors'] = cell_colors_clipped -######## -from shapely.geometry import Point, LineString, shape -nasc_df = survey.input["acoustics"]["nasc_df"] -nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], nasc_df["latitude"]), crs=projection) -geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())).to_frame("geometry").set_crs(projection) -custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs' -cells_latlon_clipped.to_crs(custom_crs).crs -######## -import sqlalchemy as sqla -import matplotlib.colors as colors -import matplotlib.cm as cm -cells_transformed = cells_latlon.to_crs(utm_code) -lims = cells_transformed.total_bounds - -fig, ax = plt.subplots(figsize=(10, 10)) -# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) -# cells_clipped.plot.hexbin() -cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) -# cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) -# cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), vmax=cells_latlon["cell_colors"].max()) -cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, orientation="horizontal", shrink=0.5) -cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, labelpad=10, loc='center') -cbar.ax.xaxis.set_label_position('top') -cbar.ax.xaxis.set_ticks_position('top') -geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") -# geo_df.reset_index().plot(ax=ax, color="red") -# plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") -ax.margins(0.00, 0.00) -world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -# world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") -# bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') -plt.xlim(lims[0]*1.02, lims[2]*1.01) +# co = SQL(db_filepath, "select", table_name="coastline_df") +# co["geometry"] = co["geometry"].apply(wkt.loads) +# co_gdf = gpd.GeoDataFrame(co, geometry="geometry", crs=projection) + + +# test["geometry"].apply(wkt.loads) +# clipped_cells_latlon["geometry"] +# len(bbox_latlon.exterior.coords) +# len(buffer_boundary.exterior.coords) + +# # world_clipped_latlon = gpd.clip(world_utm, 
buffer_boundary).to_crs(projection) +# world_clipped_latlon +# ######## +# cells_clipped = cells_gdf["geometry"].difference(world_clipped.geometry.union_all()) +# .to_frame("geometry") +# # cells_clipped = cells_gdf["geometry"].difference(world_clipped_latlon.geometry.union_all()) +# .to_frame("geometry") +# cell_colors = cells_clipped.area / (lat_step * lon_step) +# # cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +# cells_clipped['cell_colors'] = cell_colors +# # ---> back to epsg lat/long +# cells_latlon = cells_clipped.to_crs(projection) +# cells_latlon_clipped = gpd.clip(cells_latlon, bb_orig_gdf) +# cell_colors_clipped = cells_latlon_clipped.to_crs(utm_code).area / (lat_step * lon_step) +# # cell_colors = cells_clipped.to_crs({"proj": "cea"}).area / 46300.00000000001**2 +# cells_latlon_clipped['cell_colors'] = cell_colors_clipped +# ######## +# from shapely.geometry import LineString, Point, shape + +# nasc_df = survey.input["acoustics"]["nasc_df"] +# nasc_gdf = gpd.GeoDataFrame(data=nasc_df, geometry=gpd.points_from_xy(nasc_df["longitude"], +# nasc_df["latitude"]), crs=projection) +# geo_df = nasc_gdf.groupby(["transect_num"])['geometry'].apply(lambda x: LineString(x.tolist())) +# .to_frame("geometry").set_crs(projection) +# custom_crs = '+proj=epsg:4326 +lat_ts=0 +lat_0=0 +lon_0=-180 +x_0=0 +y_0=0 +datum=WGS84 +units=m +# +no_defs +type=crs' +# cells_latlon_clipped.to_crs(custom_crs).crs +# import matplotlib.cm as cm +# import matplotlib.colors as colors + +# ######## +# import sqlalchemy as sqla + +# cells_transformed = cells_latlon.to_crs(utm_code) +# lims = cells_transformed.total_bounds + +# fig, ax = plt.subplots(figsize=(10, 10)) +# # cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# # cells_clipped.plot.hexbin() +# cells_latlon.to_crs(utm_code).plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", + +# legend=False) +# # cells_latlon.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=False) +# # cells_latlon_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", +# # legend=False) +# # cells_clipped.plot(ax=ax, column="cell_colors", edgecolor="black", cmap="viridis", legend=True) +# # cells_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# norm = colors.Normalize(vmin=cells_latlon["cell_colors"].min(), +# vmax=cells_latlon["cell_colors"].max()) +# cbar = plt.colorbar(cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax, +# orientation="horizontal", +# shrink=0.5) +# cbar.set_label("Normalized grid area (50x50 nmi)", fontsize=12, labelpad=10, loc='center') +# cbar.ax.xaxis.set_label_position('top') +# cbar.ax.xaxis.set_ticks_position('top') +# geo_df.reset_index().to_crs(utm_code).plot(ax=ax, color="red") +# # geo_df.reset_index().plot(ax=ax, color="red") +# # plt.plot(ax=ax, nasc_df["longitude"], nasc_df["latitude"], color="red") +# ax.margins(0.00, 0.00) +# world_orig.to_crs(utm_code).plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # world_orig.plot(ax=ax, linewidth=1.2, color='gray', edgecolor="black") +# # bb_orig_gdf.to_crs(utm_code).plot(ax=ax, facecolor='none', edgecolor='red') +# plt.xlim(lims[0]*1.02, lims[2]*1.01) +# # ax.set_yticks([4e6, 5e6, 6e6]) +# # ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) +# plt.ylim(lims[1]*0.98, lims[3]*1.005) # ax.set_yticks([4e6, 5e6, 6e6]) # ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) -plt.ylim(lims[1]*0.98, lims[3]*1.005) -ax.set_yticks([4e6, 
5e6, 6e6]) -ax.set_yticklabels(["4000", "5000", "6000"], fontsize=10) -plt.xlabel("Eastings (km)") -plt.ylabel("Northings (km)") -# plt.xlabel("Longitude (°E)") -# ax.set_xticks([-135, -130, -125, -120]) -# plt.ylabel("Latitude (°N)") -ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) -ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) -# Adding the colorbar title -# cax = fig.get_axes()[1] # Assuming the colorbar is the second axis -# cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar -plt.tight_layout() -plt.show() \ No newline at end of file +# plt.xlabel("Eastings (km)") +# plt.ylabel("Northings (km)") +# # plt.xlabel("Longitude (°E)") +# # ax.set_xticks([-135, -130, -125, -120]) +# # plt.ylabel("Latitude (°N)") +# ax.set_xticks([-600e3, -400e3, -200e3, 0, 200e3, 400e3, 600e3, 800e3]) +# ax.set_xticklabels(["-600", "-400", "-200", "0", "200", "400", "600", "800"], fontsize=10) +# # Adding the colorbar title +# # cax = fig.get_axes()[1] # Assuming the colorbar is the second axis +# # cax.set_ylabel("Normalized grid area (25x25 nmi)") # Setting the title of the colorbar +# plt.tight_layout() +# plt.show() diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 8e15088c..7c462db8 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,373 +1,393 @@ -from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import SQL -import echopop.live.live_visualizer as elv -from pathlib import Path -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -) -import boto3 -from botocore.exceptions import NoCredentialsError, ClientError -import pandas as pd -import numpy as np -from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, sql_group_update, query_processed_files, sql_update_strata_summary -from echopop.live.live_spatial_methods import apply_spatial_definitions -from echopop.live.live_acoustics import average_sigma_bs, compute_nasc -from echopop.live.live_biology import compute_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator -from functools import reduce -from echopop.live.live_data_loading import filter_filenames, read_biology_csv - -#################################################################################################### -# TEST: Set up `LiveSurvey` object -# NOTE: General initialization parameter configuration -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -# NOTE: File configuration -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" -# NOTE: Create object -realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -self = realtime_survey -file_configuration = self.config - -input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] -realtime_survey.config["input_directories"]["biology"]["directory"] = "s3://sh2407-upload/data/Echopop-biology" - -survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", 
table_name="survey_data_df") - - -del realtime_survey.config["data_root_dir"] -self = realtime_survey - -# realtime_survey.config["storage_options"] = aws_credentials -realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -realtime_survey.load_biology_data(input_filenames=input_filenames) -realtime_survey.input["biology"] -def is_s3_path(path): - """Check if a path is an S3 path.""" - return path.startswith("s3://") - -dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] -s3_path = dataset_directory -is_s3_path(dataset_directory) - -cloud_credentials = aws_credentials -cloud_credentials = {} -def validate_s3_path(s3_path: str, cloud_credentials: dict): - """Check if (parts of) S3 path exists.""" - - # Redundant validation that S3 object validation is appropriate - if not is_s3_path(s3_path): - raise ValueError("The path is not an S3 path.") - - # Validate credentials - if not all([True if param in cloud_credentials.keys() else False - for param in ["key", "secret"]]): - # ---- Find missing credentials - missing_creds = set(["key", "secret"]) - set(cloud_credentials) - # ---- Format into string - missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in missing_creds]) - # ---- Raise Error - raise PermissionError( - f"Required S3 credentials missing: {missing_creds_str}." - ) - - # Remove the s3:// prefix - s3_path_reduced = s3_path[len("s3://"):] - - # Split into bucket and key - parts = s3_path_reduced.split("/", 1) - if len(parts) < 2: - raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - - # Get bucket name and directory keys - bucket_name, directory = parts - - # Initialize the S3 client - s3_client = boto3.client("s3", - aws_access_key_id=cloud_credentials["key"], - aws_secret_access_key=cloud_credentials["secret"]) - - # Check if the bucket exists - try: - s3_client.head_bucket(Bucket=bucket_name) - except ClientError as e: - raise FileNotFoundError( - f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
- ) - - # Check if the S3 directory exists - try: - # ---- Ping a response from the bucket - response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) - # ---- Check for `Contents` - if "Contents" not in response: - raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") - except ClientError as e: - # --- Raise Error and propagate it upwards - raise e - -validate_s3_path(s3_path, cloud_credentials) - -import pandas as pd - -self = realtime_survey -biology_files = self.meta["provenance"]["biology_files_read"] -file_configuration = self.config -dataset = "biology" - -# Get the dataset file settings -file_settings = file_configuration["input_directories"][dataset] - -def construct_directorypath(file_configuration: dict, file_settings: dict): - """Construct the root directory path.""" - - # Get the general root_directory, if present - if "data_root_dir" in file_configuration: - root_directory = file_configuration["data_root_dir"] - else: - root_directory = "" - - # Get the local directory (or this may be the root directory depending on the config) - data_directory = file_settings["directory"] - - # Return the directory path - if root_directory != "": - return "/".join([root_directory, data_directory]) - else: - return data_directory - -directory_path = construct_directorypath(file_configuration, file_settings) - -def validate_local_path(directory_path: str): - - # Validate filepath - # ---- Error evaluation (if applicable) - if not Path(directory_path).exists(): - raise FileNotFoundError( - f"The acoustic data directory [{directory_path}] does not exist." - ) - - # Validate that files even exist - # ---- List available files of target extension - data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) - # ---- Error evaluation (if applicable) - if not data_files: - raise FileNotFoundError( - f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" - ) - - - - -# Get the biology data file settings -file_settings = file_configuration["input_directories"]["biology"] - -# Get the file-specific settings, datatypes, columns, etc. 
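# NOTE: The surrounding scratch code mixes the s3fs/pandas "storage_options" convention
# ("key"/"secret") with boto3's keyword argument names; a minimal sketch of that mapping
# and of a cheap prefix-existence probe. The bucket name, prefix, and credentials below
# are placeholders:
import boto3

storage_options = {"key": "<access-key-id>", "secret": "<secret-access-key>"}  # placeholders
s3_client = boto3.client(
    "s3",
    aws_access_key_id=storage_options["key"],
    aws_secret_access_key=storage_options["secret"],
)
# A single keyed object under the prefix is enough to confirm the S3 "directory" exists
response = s3_client.list_objects_v2(Bucket="example-bucket", Prefix="data/", MaxKeys=1)
prefix_exists = "Contents" in response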
-# ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# ---- Extract the expected file name ID's -biology_file_ids = file_settings["file_name_formats"] -# ---- Extract all of the file ids -biology_config_ids = list(biology_file_ids.keys()) -# ---- Initialize the dictionary that will define this key in the `input` attribute -biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - - -# Initialize a session with AWS credentials -s3_client = boto3.client( - 's3', - aws_access_key_id=aws_credentials["key"], - aws_secret_access_key=aws_credentials["secret"] -) -response = s3_client.list_buckets() -buckets = response.get('Buckets', []) -for bucket in buckets: - print(f"Bucket Name: {bucket['Name']}") -s3_client.head_bucket(Bucket="sh2407-upload") -realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) -realtime_survey.config["ship_id"] -grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") -grid_data[grid_data.abundance > 0] -bucket = boto3.client("s3", region_name=None) -bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] + "/") -bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], Prefix=path, MaxKeys=1) -#################################################################################################### -# TEST: TRIGGER --> NEW ACOUSTIC DATA -# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) -realtime_survey.load_acoustic_data() -# NOTE: Process new acoustic data -# NOTE: This will update linked database tables -realtime_survey.process_acoustic_data() -# NOTE: Generate population estimates (or pass if there are no biological data) -# NOTE: `working_dataset = Literal["acoustic", "biology"]` -realtime_survey.estimate_population(working_dataset="acoustic") -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey.input["acoustics"] -#################################################################################################### -# TEST: TRIGGER --> NEW BIOLOGY DATA -# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]`) -realtime_survey.load_biology_data() -len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) -realtime_survey.meta["provenance"]["biology_files_checkpoint3"] -# NOTE: Process new biological data -# NOTE: This will update linked database tables -realtime_survey.process_biology_data() -# NOTE: Generate population estimates (or pass if there are no acoustic data) -# NOTE: `working_dataset = Literal["acoustic", "biology"]` -realtime_survey.estimate_population(working_dataset="biology") -# NOTE: String-representation via `LiveSurvey.__repr__`: -# NOTE: Lists current files being processed and linked databases (WIP) -realtime_survey -#################################################################################################### -# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` -# NOTE: `LiveSurvey.meta` attribute -# ---- ACOUSTIC -realtime_survey.meta["provenance"]["acoustic_files"] -# ---- BIOLOGICAL -realtime_survey.meta["provenance"]["biology_files"] -# NOTE: SQL function query from database file [cumulative list] -# ---- ACOUSTIC -SQL(db_file=realtime_survey.config["database"]["acoustics"], - 
command="select", table_name="files_processed") -dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", table_name="files_processed") -# ---- BIOLOGICAL -SQL(db_file=realtime_survey.config["database"]["biology"],command="select", table_name="files_processed") -dat.loc[0:, "filepath"][105] -#################################################################################################### -# TEST: `LiveSurvey` --[(key) SQL tables]--> Users -# !!! The SQL functions will fail if the tables have not yet been created/initialized -# ---- ACOUSTICS -# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df").latitude.max() -realtime_survey.input["spatial"]["strata"] -# NOTE: Along-track acoustically-derived number/biomass densities and NASC -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# ---- BIOLOGICAL -# NOTE: Fitted (discretized) length-weight relationship -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -# NOTE: Quantized length-binned weights (summed) -SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -# NOTE: Average weights per stratum -SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -# NOTE: Stratum summary tables -SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -#################################################################################################### -# FROM THE `LiveSurvey` object ! 
-# ---- Convert to a Panel -import panel as pn -# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table -survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -# grid_db = Path(realtime_survey.config["database"]["grid"]) -grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") -dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -dat -dat1 = SQL(grid_db, "select", table_name="grid_df") -SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") - -sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" -# Create the engine -engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") -# Create the SQL database connection and send the script -with engine.connect() as connection: - table = connection.execute(text(sql_cmd)) - -data = table.fetchall() -dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] -dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] -dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) -SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", dataframe=dd) -SQL("C:/Users/Brandyn/Downloads/biology.db", "map") -SQL(biology_db, "drop", table_name="sigma_bs_mean_df") -SQL(biology_db, "select", table_name="sigma_bs_mean_df") -dd.loc[:, "haul_num"] = pd.Series([101, 103]) -dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] -SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) -SQL(biology_db, "select", table_name="sigma_bs_mean_df") -import numpy as np; import pandas as pd -SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") -sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="sigma_bs_mean_df") -table_df = SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -sigma_bs_df = table_df -# ---- Check the table keys -table_keys = np.unique(table_df["id"]).tolist() -# ---- Get unique values -current_keys = np.unique(sigma_bs_df["id"]).tolist() -# ---- Get INSERTION keys -insertion_keys = list(set(current_keys).difference(set(table_keys))) -# ---- Get UPDATE keys -update_keys = list(set(current_keys).intersection(set(table_keys))) -insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] -insertion_df.loc[0, "species_id"] = 22500 -insertion_df.loc[0, "stratum"] = 5 -insertion_df.loc[0, "haul_num"] = 100 -insertion_df.loc[0, "sigma_bs"] = 1e-10 -insertion_df.loc[0, "sigma_bs_count"] = 100 -insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 -insertion_df.loc[0, "id"] = f"{(1,1,1)}" -SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", - dataframe=insertion_df) -SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -dat1[dat1.abundance > 0] -dat[dat.number_density > 0] -coast_db = grid_db -biology_db = Path(realtime_survey.config["database"]["biology"]) -projection = realtime_survey.config["geospatial"]["projection"] -# NOTE: PLOTS -# Ensure Panel is initialized -pn.extension() -# ---- Helper function -def plt_to_pn(fig): - # Convert to a panel object - panel = pn.panel(fig) - # Display - panel.show() # OR panel.servable() if you want to serve it 
in a Panel server -# ---- PLOT GRID -fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) -fig.show() -plt_to_pn(fig) -# ---- PLOT TRACK -from echopop.live.live_visualizer import plot_livesurvey_track -fig1 = plot_livesurvey_track(survey_data, projection, coast_db) -fig1.show() -plt_to_pn(fig1) -# ---- PLOT DISTRIBUTIONS -weight_table = SQL(biology_db, "select", - table_name="length_weight_df") -stratum_table = SQL(biology_db, "select", - table_name="strata_summary_df") -specimen_table = SQL(biology_db, "select", - table_name="specimen_data_df") -length_table = SQL(biology_db, "select", - table_name="length_df") -fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, length_table) -plt_to_pn(fig2) -### MULTIPANEL -panel0 = pn.panel(fig, name='Gridded population estimates') -panel1 = pn.panel(fig1, name='Alongtrack population estimates') -panel2 = pn.panel(fig2, name='Length and weight distributions') - -def serve_panels(): - # Create links to each panel - home = pn.Column( - pn.pane.Markdown("# Main Page"), - pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", sizing_mode="stretch_width"), - pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", sizing_mode="stretch_width"), - pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", sizing_mode="stretch_width") - ) - - # Serve the home page and individual panels - pn.serve({ - 'Main Page': home, - 'gridded_population_estimates': panel0, - 'alongtrack_population_estimates': panel1, - 'length_weight_distributions': panel2 - }, show=True) -# Run the function to serve panels -serve_panels() \ No newline at end of file +# from echopop.live.live_survey import LiveSurvey +# from echopop.live.sql_methods import SQL +# import echopop.live.live_visualizer as elv +# from pathlib import Path +# from echopop.live import live_data_processing as eldp +# from echopop.live import live_data_loading as eldl +# from echopop.live.live_core import( +# LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +# ) +# import boto3 +# from botocore.exceptions import NoCredentialsError, ClientError +# import pandas as pd +# import numpy as np +# from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +# sql_group_update, query_processed_files, sql_update_strata_summary +# from echopop.live.live_spatial_methods import apply_spatial_definitions +# from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +# from echopop.live.live_biology import compute_sigma_bs +# from echopop.acoustics import ts_length_regression, to_dB, to_linear +# from echopop.utils.operations import group_interpolator_creator +# from functools import reduce +# from echopop.live.live_data_loading import filter_filenames, read_biology_csv + +# ################################################################################################## +# # TEST: Set up `LiveSurvey` object +# # NOTE: General initialization parameter configuration +# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initializat +# ion_config.yml" +# # NOTE: File configuration +# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_yea +# r_2019_config.yml" +# # NOTE: Create object +# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +# realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) + +# # NOTE: String-representation via 
`LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# self = realtime_survey +# file_configuration = self.config + +# input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", +# "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] +# realtime_survey.config["input_directories"]["biology"]["directory"] = +# "s3://sh2407-upload/data/Echopop-biology" + +# survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", +# table_name="survey_data_df") + + +# del realtime_survey.config["data_root_dir"] +# self = realtime_survey + +# # realtime_survey.config["storage_options"] = aws_credentials +# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) +# realtime_survey.load_biology_data(input_filenames=input_filenames) +# realtime_survey.input["biology"] +# def is_s3_path(path): +# """Check if a path is an S3 path.""" +# return path.startswith("s3://") + +# dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] +# s3_path = dataset_directory +# is_s3_path(dataset_directory) + +# cloud_credentials = aws_credentials +# cloud_credentials = {} +# def validate_s3_path(s3_path: str, cloud_credentials: dict): +# """Check if (parts of) S3 path exists.""" + +# # Redundant validation that S3 object validation is appropriate +# if not is_s3_path(s3_path): +# raise ValueError("The path is not an S3 path.") + +# # Validate credentials +# if not all([True if param in cloud_credentials.keys() else False +# for param in ["key", "secret"]]): +# # ---- Find missing credentials +# missing_creds = set(["key", "secret"]) - set(cloud_credentials) +# # ---- Format into string +# missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in +# missing_creds]) +# # ---- Raise Error +# raise PermissionError( +# f"Required S3 credentials missing: {missing_creds_str}." +# ) + +# # Remove the s3:// prefix +# s3_path_reduced = s3_path[len("s3://"):] + +# # Split into bucket and key +# parts = s3_path_reduced.split("/", 1) +# if len(parts) < 2: +# raise ValueError(f"Invalid S3 path format for '{s3_path}'.") + +# # Get bucket name and directory keys +# bucket_name, directory = parts + +# # Initialize the S3 client +# s3_client = boto3.client("s3", +# aws_access_key_id=cloud_credentials["key"], +# aws_secret_access_key=cloud_credentials["secret"]) + +# # Check if the bucket exists +# try: +# s3_client.head_bucket(Bucket=bucket_name) +# except ClientError as e: +# raise FileNotFoundError( +# f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
+# ) + +# # Check if the S3 directory exists +# try: +# # ---- Ping a response from the bucket +# response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) +# # ---- Check for `Contents` +# if "Contents" not in response: +# raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") +# except ClientError as e: +# # --- Raise Error and propagate it upwards +# raise e + +# validate_s3_path(s3_path, cloud_credentials) + +# import pandas as pd + +# self = realtime_survey +# biology_files = self.meta["provenance"]["biology_files_read"] +# file_configuration = self.config +# dataset = "biology" + +# # Get the dataset file settings +# file_settings = file_configuration["input_directories"][dataset] + +# def construct_directorypath(file_configuration: dict, file_settings: dict): +# """Construct the root directory path.""" + +# # Get the general root_directory, if present +# if "data_root_dir" in file_configuration: +# root_directory = file_configuration["data_root_dir"] +# else: +# root_directory = "" + +# # Get the local directory (or this may be the root directory depending on the config) +# data_directory = file_settings["directory"] + +# # Return the directory path +# if root_directory != "": +# return "/".join([root_directory, data_directory]) +# else: +# return data_directory + +# directory_path = construct_directorypath(file_configuration, file_settings) + +# def validate_local_path(directory_path: str): + +# # Validate filepath +# # ---- Error evaluation (if applicable) +# if not Path(directory_path).exists(): +# raise FileNotFoundError( +# f"The acoustic data directory [{directory_path}] does not exist." +# ) + +# # Validate that files even exist +# # ---- List available files of target extension +# data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) +# # ---- Error evaluation (if applicable) +# if not data_files: +# raise FileNotFoundError( +# f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" +# ) + + +# # Get the biology data file settings +# file_settings = file_configuration["input_directories"]["biology"] + +# # Get the file-specific settings, datatypes, columns, etc. 
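# NOTE: The commented `validate_local_path` above calls `.glob` on a plain string (only the
# existence check wraps it in `Path`), so it would raise an AttributeError if revived. A
# minimal corrected sketch; passing the extension explicitly, instead of reading the
# enclosing-scope `file_settings`, is an editorial assumption:
from pathlib import Path


def validate_local_path(directory_path: str, extension: str) -> None:
    """Check that a data directory exists and contains at least one matching file."""
    directory = Path(directory_path)
    # ---- Error evaluation (if applicable)
    if not directory.exists():
        raise FileNotFoundError(f"The data directory [{directory}] does not exist.")
    # ---- List available files of the target extension
    data_files = list(directory.glob(f"*.{extension}"))
    # ---- Error evaluation (if applicable)
    if not data_files:
        raise FileNotFoundError(f"No `*.{extension}` files found in [{directory}]!")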
+# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` +# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] +# # ---- Extract the expected file name ID's +# biology_file_ids = file_settings["file_name_formats"] +# # ---- Extract all of the file ids +# biology_config_ids = list(biology_file_ids.keys()) +# # ---- Initialize the dictionary that will define this key in the `input` attribute +# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} + + +# # Initialize a session with AWS credentials +# s3_client = boto3.client( +# 's3', +# aws_access_key_id=aws_credentials["key"], +# aws_secret_access_key=aws_credentials["secret"] +# ) +# response = s3_client.list_buckets() +# buckets = response.get('Buckets', []) +# for bucket in buckets: +# print(f"Bucket Name: {bucket['Name']}") +# s3_client.head_bucket(Bucket="sh2407-upload") +# realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) +# realtime_survey.config["ship_id"] +# grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") +# grid_data[grid_data.abundance > 0] +# bucket = boto3.client("s3", region_name=None) +# bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] +# +"/") +# bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], +# Prefix=path, MaxKeys=1) +# ################################################################################################# +# # TEST: TRIGGER --> NEW ACOUSTIC DATA +# # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) +# realtime_survey.load_acoustic_data() +# # NOTE: Process new acoustic data +# # NOTE: This will update linked database tables +# realtime_survey.process_acoustic_data() +# # NOTE: Generate population estimates (or pass if there are no biological data) +# # NOTE: `working_dataset = Literal["acoustic", "biology"]` +# realtime_survey.estimate_population(working_dataset="acoustic") +# # NOTE: String-representation via `LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# realtime_survey.input["acoustics"] +# ################################################################################################## +# # TEST: TRIGGER --> NEW BIOLOGY DATA +# # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` +# realtime_survey.load_biology_data() +# len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) +# realtime_survey.meta["provenance"]["biology_files_checkpoint3"] +# # NOTE: Process new biological data +# # NOTE: This will update linked database tables +# realtime_survey.process_biology_data() +# # NOTE: Generate population estimates (or pass if there are no acoustic data) +# # NOTE: `working_dataset = Literal["acoustic", "biology"]` +# realtime_survey.estimate_population(working_dataset="biology") +# # NOTE: String-representation via `LiveSurvey.__repr__`: +# # NOTE: Lists current files being processed and linked databases (WIP) +# realtime_survey +# ################################################################################################## +# # TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# # NOTE: `LiveSurvey.meta` attribute +# # ---- ACOUSTIC +# realtime_survey.meta["provenance"]["acoustic_files"] +# # ---- BIOLOGICAL +# realtime_survey.meta["provenance"]["biology_files"] +# # NOTE: SQL function query from 
database file [cumulative list] +# # ---- ACOUSTIC +# SQL(db_file=realtime_survey.config["database"]["acoustics"], +# command="select", table_name="files_processed") +# dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", +# table_name="files_processed") +# # ---- BIOLOGICAL +# SQL(db_file=realtime_survey.config["database"]["biology"],command="select", +# table_name="files_processed") +# dat.loc[0:, "filepath"][105] +# ################################################################################################## +# # TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# # !!! The SQL functions will fail if the tables have not yet been created/initialized +# # ---- ACOUSTICS +# # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") +# .latitude.max() +# realtime_survey.input["spatial"]["strata"] +# # NOTE: Along-track acoustically-derived number/biomass densities and NASC +# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# # ---- BIOLOGICAL +# # NOTE: Fitted (discretized) length-weight relationship +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +# # NOTE: Quantized length-binned weights (summed) +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# # NOTE: Average weights per stratum +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# # NOTE: Stratum summary tables +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +# ################################################################################################## +# # FROM THE `LiveSurvey` object ! 
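# NOTE: In the Panel/SQL scratch block that follows, the `create_engine` call nests double
# quotes inside a double-quoted f-string, which only parses on Python >= 3.12 (the colorbar
# label f-string earlier has the same pattern). A version-agnostic sketch of that query,
# with a placeholder database path:
import pandas as pd
from sqlalchemy import create_engine, text

db_file = "path/to/biology.db"  # placeholder path
engine = create_engine(f"sqlite:///{db_file}")
sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id"
with engine.connect() as connection:
    table = connection.execute(text(sql_cmd))
    data = pd.DataFrame(table.fetchall(), columns=table.keys())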
+# # ---- Convert to a Panel +# import panel as pn +# # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +# survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) +# # grid_db = Path(realtime_survey.config["database"]["grid"]) +# grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") +# dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# dat +# dat1 = SQL(grid_db, "select", table_name="grid_df") +# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") + +# sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" +# # Create the engine +# engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# table = connection.execute(text(sql_cmd)) + +# data = table.fetchall() +# dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] +# dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] +# dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) +# SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", +# dataframe=dd) +# SQL("C:/Users/Brandyn/Downloads/biology.db", "map") +# SQL(biology_db, "drop", table_name="sigma_bs_mean_df") +# SQL(biology_db, "select", table_name="sigma_bs_mean_df") +# dd.loc[:, "haul_num"] = pd.Series([101, 103]) +# dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] +# SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) +# SQL(biology_db, "select", table_name="sigma_bs_mean_df") +# import numpy as np; import pandas as pd +# SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") +# sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", +# table_name="sigma_bs_mean_df") +# table_df = SQL(realtime_survey.config["database"]["biology"], "select", +# table_name="sigma_bs_mean_df") +# sigma_bs_df = table_df +# # ---- Check the table keys +# table_keys = np.unique(table_df["id"]).tolist() +# # ---- Get unique values +# current_keys = np.unique(sigma_bs_df["id"]).tolist() +# # ---- Get INSERTION keys +# insertion_keys = list(set(current_keys).difference(set(table_keys))) +# # ---- Get UPDATE keys +# update_keys = list(set(current_keys).intersection(set(table_keys))) +# insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] +# insertion_df.loc[0, "species_id"] = 22500 +# insertion_df.loc[0, "stratum"] = 5 +# insertion_df.loc[0, "haul_num"] = 100 +# insertion_df.loc[0, "sigma_bs"] = 1e-10 +# insertion_df.loc[0, "sigma_bs_count"] = 100 +# insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 +# insertion_df.loc[0, "id"] = f"{(1,1,1)}" +# SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", +# dataframe=insertion_df) +# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") +# survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", +# table_name="survey_data_df") +# dat1[dat1.abundance > 0] +# dat[dat.number_density > 0] +# coast_db = grid_db +# biology_db = Path(realtime_survey.config["database"]["biology"]) +# projection = realtime_survey.config["geospatial"]["projection"] +# # NOTE: PLOTS +# # Ensure Panel is initialized +# pn.extension() +# # ---- Helper function +# def 
plt_to_pn(fig): +# # Convert to a panel object +# panel = pn.panel(fig) +# # Display +# panel.show() # OR panel.servable() if you want to serve it in a Panel server +# # ---- PLOT GRID +# fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +# fig.show() +# plt_to_pn(fig) +# # ---- PLOT TRACK +# from echopop.live.live_visualizer import plot_livesurvey_track +# fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +# fig1.show() +# plt_to_pn(fig1) +# # ---- PLOT DISTRIBUTIONS +# weight_table = SQL(biology_db, "select", +# table_name="length_weight_df") +# stratum_table = SQL(biology_db, "select", +# table_name="strata_summary_df") +# specimen_table = SQL(biology_db, "select", +# table_name="specimen_data_df") +# length_table = SQL(biology_db, "select", +# table_name="length_df") +# fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, +# length_table) +# plt_to_pn(fig2) +# ### MULTIPANEL +# panel0 = pn.panel(fig, name='Gridded population estimates') +# panel1 = pn.panel(fig1, name='Alongtrack population estimates') +# panel2 = pn.panel(fig2, name='Length and weight distributions') + +# def serve_panels(): +# # Create links to each panel +# home = pn.Column( +# pn.pane.Markdown("# Main Page"), +# pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", +# sizing_mode="stretch_width"), +# pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", +# sizing_mode="stretch_width"), +# pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", +# sizing_mode="stretch_width") +# ) + +# # Serve the home page and individual panels +# pn.serve({ +# 'Main Page': home, +# 'gridded_population_estimates': panel0, +# 'alongtrack_population_estimates': panel1, +# 'length_weight_distributions': panel2 +# }, show=True) +# # Run the function to serve panels +# serve_panels() diff --git a/echopop/utils/operations.py b/echopop/utils/operations.py index 5db0e84c..2ac1b77e 100644 --- a/echopop/utils/operations.py +++ b/echopop/utils/operations.py @@ -306,10 +306,12 @@ def group_merge(dataframe, dataframes_to_add, inner_on, outer_on, how="outer", d def group_interpolator_creator( - grouped_data: pd.DataFrame, independent_var: str, dependent_var: str, - contrast: Union[List[str], str] + grouped_data: pd.DataFrame, + independent_var: str, + dependent_var: str, + contrast: Union[List[str], str], ) -> dict: - + # Check if `contrast` is a list or not if not isinstance(contrast, list): contrast = [] @@ -328,9 +330,7 @@ def interpolator_factory(sub_group): # Produce a dictionary comprising all of the produced interpolators interpolators = ( - grouped_data.groupby(contrast).apply( - lambda group: interpolator_factory(group) - ) + grouped_data.groupby(contrast).apply(lambda group: interpolator_factory(group)) ).to_dict() # Return output diff --git a/echopop/zarr_read_ingest_test.py b/echopop/zarr_read_ingest_test.py index 101bc81a..f9385b6b 100644 --- a/echopop/zarr_read_ingest_test.py +++ b/echopop/zarr_read_ingest_test.py @@ -1,1885 +1,1948 @@ -import xarray as xr -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from typing import Union, Tuple, Optional -from pathlib import Path -import copy -import yaml -import glob -from datetime import datetime -import geopandas as gpd -import os -import re -import contextlib -from echopop.acoustics import ts_length_regression, to_linear, to_dB -from sqlalchemy import create_engine, text, Engine, inspect -from echopop.live.live_core 
import LIVE_DATA_STRUCTURE, LIVE_FILE_FORMAT_MAP, LIVE_INPUT_FILE_CONFIG_MAP, SPATIAL_CONFIG_MAP -from echopop.live.live_data_loading import validate_data_directory -from echopop.live.sql_methods import SQL, SQL_COMMANDS, query_processed_files, format_sql_columns, sql_group_update, sql_data_exchange, initialize_database, sql_update_strata_summary -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_data_processing import query_dataset, get_unique_identifiers -from echopop.live.live_survey import LiveSurvey -from echopop.live.live_acoustics import integrate_nasc, configure_transmit_frequency -from echopop.live.live_biology import preprocess_biology_data -from echopop.survey import Survey -import geopandas as gpd -import pandas as pd -import numpy as np -import shapely.geometry -from shapely.geometry import box -from echopop.spatial.projection import utm_string_generator -from geopy.distance import distance -from echopop.live.sql_methods import SQL -from shapely import wkt -import matplotlib.pyplot as plt -import geopandas as gpd -import matplotlib.colors as colors -import matplotlib.cm as cm -import numpy as np -from matplotlib.colors import ListedColormap -self = realtime_survey -spatial_config = self.config["geospatial"] -dataset = self.input["acoustics"]["nasc_df"] - - - - -survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization_config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config.yml") -survey_2019.transect_analysis() -survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] -analysis_dict = survey_2019.analysis["transect"] -SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") -proportions_dict=analysis_dict["biology"]["proportions"]["number"] -length_weight_dict = analysis_dict["biology"]["weight"] -stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] - -updated_survey_data = nasc_biology.copy() -gridding_column = file_configuration["gridding_column"] - -unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) - - -file_configuration = self.config -grid_settings["grid_resolution"]["x"] = 50 -grid_settings["grid_resolution"]["y"] = 50 -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters -self = realtime_survey -file_configuration = self.config - -def initialize_grid(): - - # Get root directory, if defined - if "data_root_dir" in file_configuration: - root_dir = Path(file_configuration["data_root_dir"]) - else: - root_dir = Path() - - # Get `grid` settings - grid_database = file_configuration["input_directories"]["grid"]["database_name"] - - # Create full filepath - db_filepath = root_dir / "database" / grid_database - - # Create if file doesn't already exist - if not db_filepath.exists(): - - # Get projection - projection = file_configuration["geospatial"]["projection"] - - # Get grid settings - grid_settings = file_configuration["geospatial"]["griddify"] - - # Get the resolution - resolution = grid_settings["grid_resolution"] - # ---- Convert from nmi to m - resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} - - # Get boundary coordinates - boundary = grid_settings["bounds"] - # ---- x - x = boundary["longitude"] - # ---- y - y = boundary["latitude"] - # ---- Create DataFrame - boundary_df = pd.DataFrame({ - "x": 
np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), - "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) - }) - - # Create GeoDataFrame - boundary_gdf = gpd.GeoDataFrame( - data = boundary_df, - geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), - crs = projection - ) - - # Convert to UTM (decimal degrees to m) - # ---- Create UTM code - utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, - (boundary_df.y.min() + boundary_df.y.max()) / 2) - # ---- Create number code - utm_num = int(utm_code) - # ---- Create string code - utm_str = f"epsg:{utm_num}" - # ---- UTM conversion - boundary_gdf_utm = boundary_gdf.to_crs(utm_num) - - # Get step sizes for each grid cell - # ---- x - x_step = resolution_m["x_distance"] - # ---- y - y_step = resolution_m["y_distance"] - - # Prepare grid cell generation - # ---- Get new boundaries - xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds - # ---- Initialize empty list - grid_cells = [] - # ---- Initialize coordinate counter - y_ct = 0 - x_coord = []; y_coord = [] - # ---- Iterate through to generate cells - for y0 in np.arange(ymin, ymax, y_step): - y_ct += 1 - x_ct = 0 - for x0 in np.arange(xmin, xmax, x_step): - x_ct += 1 - # ---- Step forward - x_coord.append(x_ct) - y_coord.append(y_ct) - x1 = x0 - x_step - y1 = y0 + y_step - # ---- Append to list - grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) - - # Convert to a GeoDataFrame - cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) - # ---- Add cordinates - cells_gdf.loc[:, "x"] = np.array(x_coord) - cells_gdf.loc[:, "y"] = np.array(y_coord) - - # Get coastline shapefile directory, if defined - if "coastline" in file_configuration["input_directories"]: - - # Get coastline settings - coast_settings = file_configuration["input_directories"]["coastline"] - # ---- Create filepath - shp_filepath = ( - root_dir / coast_settings["directory"] - / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" - ) - # ---- Validate existence - if not shp_filepath.exists(): - raise FileNotFoundError( - f"{shp_filepath} does not exist!" 
- ) - - # Get original lat/lon geometry boundaries - xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds - - # Read in file - full_coast = gpd.read_file(shp_filepath) - # ---- Convert to UTM - full_coast_utm = full_coast.to_crs(utm_code) - # ---- Remove empty - full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] - - # Create bouning box with a buffer - boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) - # ---- Create an unbuffered copy - boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) - # ---- Convert to a GeoDataFrame - boundary_box_unbuffered_gdf = ( - gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) - ) - # ---- Clip the coastline for saving - clipped_coast_original = ( - gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) - ) - - # Clip the coastline shapefile - clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) - - # Clip the grid cells - cells_gdf.loc[:, "geometry"] = ( - cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) - ) - - # Calculate area per cell - cells_gdf.loc[:, "area"] = cells_gdf.area - - # Convert back to original projection and clip - clipped_cells_latlon = ( - gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) - .reset_index(drop=True) - ) - - # Initialize empty columns that can be added to later on - clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", - "abundance", "biomass"]] = 0.0 - - # Create output DataFrame - output_df = pd.DataFrame({ - "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) - }) - # ---- Add the required columns - output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], - axis=1) - # ---- Initialize empty columns that can be added to later on - output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", - "biomass"]] = 0.0 - - # Write to the database file (for the grid) - # ---- Create engine - engine = sqla.create_engine(f"sqlite:///{db_filepath}") - # ---- Connect and create table - _ = output_df.to_sql("grid_df", engine, if_exists="replace") - - # Write to the database file (for the coastline shapefile) - # ---- Create output copy - coastline_out = pd.DataFrame({ - "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) - }) - # ---- Concatenate - coastline_out = ( - pd.concat([coastline_out, clipped_coast_original.drop(columns="geometry")], axis=1) - ) - # ---- Connect and create table - _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") - -#################################################################################################### -# TEST: YAML FILE CONFIGURATION -# ---- Define filepaths -self = LiveSurvey -live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" -live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" -# ---- Run function: `live_configuration` -file_configuration = self.config -files = biology_files - -biology_output = initial_biology_output -file_configuration = self.config -table_name = "length_df" -df = filtered_biology_output[table_name] -database_file = biology_db -kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], output_type=pd.DataFrame) - -# NOTE: ARGUMENT: {working_dataset: Literal["acoustics", "biology"]} -working_dataset = "acoustics" -self = realtime_survey -file_configuration = self.config 
-self.results["biology"] = self.input["biology_processed"] -self.results["acoustics"] = self.input["nasc_df"] - -# Get spatial column -spatial_column = file_configuration["spatial_column"] - -# Initialize the working data dictionary -working_data = copy.deepcopy(self.results) -contrast_columns = [] -# ---- Define unique columns -unique_columns = spatial_column + contrast_columns - -acoustic_db = file_configuration["database"][working_dataset] -self = realtime_survey -acoustic_dict = self.input["acoustics"] -verbose = True -contrast_columns = [] -db_file = acoustic_db -table_name="survey_data_df" -data_columns = data_columns -unique_columns=unique_columns -constraint="nasc > 0.0" -data_dict = self.input["acoustics"] -data_dict["nasc_df"]["stratum"] = 1 -data_dict["prc_nasc_df"]["stratum"] = 2 -table_name = "sigma_bs_mean_df" -data_columns=["sigma_bs", "sigma_bs_count"] -biology_db -strata_df = self.input["spatial"]["strata"] - -def biology_pipeline(biology_dict: dict, - strata_df: pd.DataFrame, - file_configuration: dict, - verbose: bool, - contrast_columns: List[str] = []): - - # Get spatial column - spatial_column = file_configuration["spatial_column"] - unique_columns = spatial_column + contrast_columns - - # Get database file - acoustic_db = file_configuration["database"]["acoustics"] - - # Get biology database file - biology_db = file_configuration["database"]["biology"] - - # Check for data completion - # ---- List of boolean values - full_biology_data = ( - [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is not None] - ) - # ---- Validation - if not all(full_biology_data): - # ---- Print, if verbose - if verbose: - print( - f"No new processed biology data available for processing." - ) - else: - # Get related biology data - acoustic_df = get_nasc_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) - - # Get the corresopding `sigma_bs` data (and also compute the sample-number weighted average) - sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, - biology_dict, - unique_columns=unique_columns) - - # Calculate population estimates if valid data are available - if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): - # ---- Merge the NASC and sigma_bs datasets - nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) - # ---- Compute the number densities (animals nmi^-2) - nasc_biology["number_density"] = ( - nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) - ) - - # Get the corresponding average strata weights (computed for all fish) - weight_spatial_averages = get_average_strata_weights(biology_db, - biology_dict, - unique_columns=unique_columns) - - if weight_spatial_averages is not None: - # Merge average weights with number density estimates - nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) - - # Compute biomass densities - nasc_biology["biomass_density"] = ( - nasc_biology["number_density"] * nasc_biology["average_weight"] - ) - - # Update the survey population estimate DataFrame with the newly computed densities - if not nasc_biology.empty: - sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", - columns=["number_density", "biomass_density"], - unique_columns=["stratum", "longitude", "latitude", "ping_time"]) - - # Summarize strata - summarize_strata(nasc_biology, strata_df, file_configuration) - -db_file=acoustic_db -dataframe=nasc_biology -table_name="survey_data_df" -columns=["number_density", "biomass_density"] 
-unique_columns=["stratum", "longitude", "latitude", "ping_time"] -nasc_biology["number_density"].sum() / 2 -nasc_biology["number_density"] -SQL(acoustic_db, "select", table_name="survey_data_df") -SQL(biology_db, "select", table_name="strata_summary_df") -strata_df = self.input["spatial"]["strata"].copy() -strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", - "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan -strata_df.drop(columns=["latitude_interval"], inplace=True) -SQL(acoustic_db, "select", table_name="survey_data_df") - -SQL(biology_db, "drop", table_name="strata_summary_df") -SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, primary_keys=["stratum"]) -SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, - id_columns=["stratum"]) - -tt = pd.DataFrame({ - "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), - "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), - "area": 50 ** 2, - "mean_number_density": 0.0, - "mean_biomass_density": 0.0, - "abundance": 0.0, - "biomass": 0.0 -}) - -nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) -nasc_biology_output_a.loc[3, "x"] = 2 -nasc_biology_output_a.loc[3, "y"] = 3 -nasc_biology_output_a = nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", "latitude", "nasc", "number_density", "biomass_density"]) -nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) -nasc_biology_output["number_density"] = ( - nasc_biology_output["nasc"] - / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) -) -nasc_biology_output =nasc_biology_output.merge(general_weight_averages) -nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] * nasc_biology_output["average_weight"] -nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude", "number_density", "biomass_density"]) -nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0].reset_index() - -SQL(acoustic_db, "drop", table_name="reference") -SQL(acoustic_db, "drop", table_name="grid") - -SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) -SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) - -SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) -SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) - -SQL(acoustic_db, "select", table_name="grid") -SQL(acoustic_db, "select", table_name="reference") - -sql_group_update(acoustic_db, dataframe=nasc_biology_output, - table_name="grid", columns=["number_density", "biomass_density"], - unique_columns=["stratum", "x", "y", "longitude", "latitude"]) - -SQL(acoustic_db, "select", table_name="grid") - -from typing import List - -data_table = "grid" -grid_table = "reference" -column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] - -dataframe = nasc_biology_output - -import sqlalchemy as sqla -grid_db_file = file_configuration["database"]["grid"] -survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" -data_table = "survey_data_df" -grid_table = "grid_df" -coordinates = ["x", "y"] -from echopop.live.sql_methods import SQL - -SQL(grid_db_file, "select", table_name=grid_table) -SQL(survey_db_file, "select", table_name=data_table) -SQL(data_table, "map") - -gridding_column = self.config["gridding_column"] - -updated_survey_data = nasc_biology.copy() -# Get relevant table -previous_grid = 
query_dataset(grid_db_file, updated_survey_data, - table_name=grid_table, - data_columns=["x", "y", "area", "number_density_mean", - "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) -previous_data = query_dataset(survey_db_file, updated_survey_data, - table_name=data_table, - data_columns=["x", "y", "number_density", "biomass_density"], - unique_columns=["x", "y"]) -# Get unique coordinates -update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) - - -# Index -previous_grid.set_index(["x", "y"], inplace=True) -previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"].mean() -previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() - -# Convert area from m^2 to nmi^2 -previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] -previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] -previous_grid = previous_grid.reset_index() - -sql_group_update(grid_db_file, dataframe=previous_grid, - table_name=grid_table, - columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], - unique_columns=["x", "y"]) - -murr = SQL(grid_db_file, "select", table_name=grid_table) -murr[murr.abundance > 0] - -update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() - -am = SQL(grid_db_file, "select", table_name="grid_df") -am[am.abundance > 0] -bm = SQL(grid_db_file, "select", table_name="grid_df") -bm[bm.abundance > 0] -number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() -biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() - -SQL(grid_db_file, "select", table_name=grid_table) - - - -pulled_data = pd.concat([SQL(grid_db_file, "select", - table_name=grid_table, - condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) -previous_cell_data = pd.concat([SQL(survey_db_file, "select", - table_name=data_table, - condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) - -from echopop.live.live_data_processing import get_nasc_sql_data, get_sigma_bs_sql_data, get_average_strata_weights, summarize_strata -from echopop.live.sql_methods import sql_group_update -from typing import List -from shapely.geometry import box -SQL(grid_db_file, "select", table_name="grid_df") -# Compute means -number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() -previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() - -[SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] - -# Write to the database file (for the grid) -# ---- Create engine -engine = sqla.create_engine(f"sqlite:///{db_filepath}") - -def update_population_grid(grid_db_file: str, - data_table: str, - grid_table: str, - dataframe: pd.DataFrame, - column_pairs: Union[List[tuple[str, str]], tuple[str, str]], - coordinates: List[str]): - - # Convert `column_pairs` to a list, if needed - if not isinstance(column_pairs, list): - column_pairs = [column_pairs] - - dataframe[coordinates] - # Format the coordinate pairs - # ---- Convert coordinate values into a list of tuples - coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] - # ---- Get unique pairs - 
coords = list(set(coord_pairs)) - - # Format the SQL script command - # ---- Initialize - sql_script = [] - # ---- Iteratively update - for input_column, output_column in column_pairs: - sql_script.append( - f""" - BEGIN TRANSACTION; - - -- Calculate averages for input_column and update grid_table - WITH avgs AS ( - SELECT - {coordinates[0]}, - {coordinates[1]}, - AVG(d.{input_column}) as avg_value - FROM {data_table} d - GROUP BY d.{coordinates[0]}, d.{coordinates[1]} - ) - - -- Update the grid_table with both average and computed total - UPDATE {grid_table} - SET - mean_{input_column} = ( - SELECT avg_value - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ), - {output_column} = ( - SELECT avg_value * {grid_table}.area - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ) - WHERE EXISTS ( - SELECT 1 - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ); - - COMMIT; - """ - ) - - # Create the engine - engine = create_engine(f"sqlite:///{db_file}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript("\n".join(sql_script)) - - - -def update_population_grid(db_file: str, - data_table: str, - grid_table: str, - dataframe: pd.DataFrame, - column_pairs: Union[List[tuple[str, str]], tuple[str, str]], - coordinates: List[str]): - - # Convert `column_pairs` to a list, if needed - if not isinstance(column_pairs, list): - column_pairs = [column_pairs] - - dataframe[coordinates] - # Format the coordinate pairs - # ---- Convert coordinate values into a list of tuples - coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] - # ---- Get unique pairs - coords = list(set(coord_pairs)) - - # Format the SQL script command - # ---- Initialize - sql_script = [] - # ---- Iteratively update - for input_column, output_column in column_pairs: - sql_script.append( - f""" - BEGIN TRANSACTION; - - -- Calculate averages for input_column and update grid_table - WITH avgs AS ( - SELECT - {coordinates[0]}, - {coordinates[1]}, - AVG(d.{input_column}) as avg_value - FROM {data_table} d - GROUP BY d.{coordinates[0]}, d.{coordinates[1]} - ) - - -- Update the grid_table with both average and computed total - UPDATE {grid_table} - SET - mean_{input_column} = ( - SELECT avg_value - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ), - {output_column} = ( - SELECT avg_value * {grid_table}.area - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ) - WHERE EXISTS ( - SELECT 1 - FROM avgs - WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} - AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} - ); - - COMMIT; - """ - ) - - # Create the engine - engine = create_engine(f"sqlite:///{db_file}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript("\n".join(sql_script)) - - -SQL(acoustic_db, "select", table_name=data_table) -SQL(acoustic_db, "select", table_name=grid_table) - - -SQL(acoustic_db, "update", table_name="grid", 
dataframe=nasc_biology_output, unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) -SQL(acoustic_db, "select", table_name="reference") - -source_db = acoustic_db -target_db = biology_db - -source_table = "grid" -target_table = "strata_summary_df" - -data_columns = ["number_density", "biomass_density"] -strata_columns = ["stratum"] -strata = [2] -stratum_list = ', '.join(map(str, stratum_values)) - -data_column = data_columns[0] -data_columns = data_columns[0] -def sql_update_strata_summary(source_db: str, - target_db: str, - arg_fun: str, - data_columns: List[tuple[str, str]], - strata: list): - - # Format strata list as a string - strata_str = ', '.join(map(str, strata)) - - # Function reference map - FUNCTION_MAP = { - "sum": {"function": "SUM", - "suffix": "sum"}, - "mean": {"function": "AVG", - "suffix": "mean"} - } - - # Prepare the SQL script - sql_script = f""" - -- Attach the source and target databases - ATTACH DATABASE '{source_db}' AS source; - ATTACH DATABASE '{target_db}' AS target; - - """ - - # Dynamically format the cross-database command - for data_column, method in data_columns: - # ----- Format the function-method-suffic keys - suffix = FUNCTION_MAP[method]["suffix"] - fun = FUNCTION_MAP[method]["function"] - # ---- Create the combined SQL command using f-strings - sql_script += f""" - -- Calculate averages and directly update the target table - UPDATE target.{target_table} - SET {data_column}_{suffix} = ( - SELECT {fun}({data_column}) - FROM source.{source_table} - WHERE stratum = target.{target_table}.stratum - ) - WHERE stratum IN ({strata_str}); - """ - # ----- Append DETACH commands only once at the end - sql_script += """ - -- Detach the databases - DETACH DATABASE source; - DETACH DATABASE target; - """ - - # Create the engine - engine = create_engine(f"sqlite:///{target_db}") - - # Create the SQL database connection and send the script - with engine.connect() as connection: - dbapi_conn = connection.connection - _ = dbapi_conn.executescript(sql_script) - -SQL(biology_db, "select", table_name=target_table) -SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() -connection.close() -dbapi_conn.close() - - -pairs = [(1, 2), (3, 4), (5, 6)] - -# Convert the pairs into a format suitable for SQL IN clause -pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) - -# Construct the SQL command as a text string -sql_command = f''' -BEGIN TRANSACTION; - -UPDATE reference -SET total = ( - SELECT AVG(g.sigma_bs) * r.area - FROM grid g - WHERE g.stratum = r.stratum_x -) -WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); - -COMMIT; -''' - -psi = 10 ** (-21/10) -psi * 280**2 * 1500 * 128e-6 / 2 -psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] - -psi * (280.0 ** 2) / 1852 ** 2 -depth_area = 280 ** 2 * psi -swath_length = 0.5 * 1852 -depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] -280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] - -SQL(acoustic_db, "map") -beam_angle = 9.0 * np.pi / 180.0 -280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] -280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * nasc_biology["number_density"] -area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) -area / 1852 ** 2 * nasc_biology["number_density"] -SQL(acoustic_db, "map") - -# Merge hake fraction data into `nasc_interval_df` -# ---- Initial merge -nasc_interval_df = nasc_interval_df.merge( - 
input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" -) -# ---- Replace `fraction_hake` where NaN occurs -nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) -# ---- Drop NaN -nasc_interval_df.dropna(subset=["transect_num"], inplace=True) - -# Calculate the along-transect number density (animals per nmi^2) -# ---- Merge NASC measurements with mean sigma_bs for each stratum -nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) -# ---- Calculate the number densities -nasc_biology["number_density"] = ( - nasc_biology["fraction_hake"] - * nasc_biology["nasc"] - / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) -) - - -if working_dataset == "acoustic": - db_file = self.config["database"]["acoustic"] -elif working_dataset == "biology": - db_file = self.config["database"]["biology"] -else: - raise ValueError( - f"Argument for `working_dataset` [{working_dataset}] is invalid." - f" Value must either be 'acoustic' or 'biology'." - ) - -# Extract the necessary correct strata mean sigma_bs -sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] - -# Pull out the length-weight conversion for each stratum -length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] - -# Get the name of the stratum column -stratum_col = settings_dict["transect"]["stratum_name"] - - -catch_data = self.input["biology"]["catch_df"] - -# Get the spatial column name, if there is one -spatial_column = file_configuration["spatial_column"] -# ---- Append additional columns that will be used -contrast_columns = spatial_column + ["sex", "species_id"] - -# Calculate grouped totals -# ---- Sum the net haul weights from station 1/unaged fish -catch_weights = catch_data.count_variable( - contrasts=["species_id"] + spatial_column, - variable="haul_weight", fun="sum" -) -# ---- Rename resulting columns for both -catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - -# ---- Specimen -specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") - -specimen_weight_binned -# Calculate the sexed and total stratum weights for each sex among unaged fish -# ---- Sum the net haul weights from station 1/unaged fish -catch_weights = catch_data.count_variable( - contrasts=["species_id"] + file_configuration["spatial_column"], - variable="haul_weight", fun="sum" -) -# ---- Rename resulting columns for both -catch_weights.rename(columns={"count": "total_weight"}, inplace=True) - -# For the specimen data -# ---- Sum the net haul weights from station 1/unaged fish -# ---- Specimen -specimen_weights_sex = ( - specimen_weight_binned - .groupby(contrast_columns)["weight"] - .sum() -) -# ---- Total (per stratum, if it exists) -specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) - -# For the length (unaged) dataset -length_weights_sex = ( - length_weight_binned - .groupby(contrast_columns)["weight_interp"] - .sum() -) -# ---- Further reduce to the grand total (per stratum, if it exists) -length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) - -# ---- Standardize the unaged sexed weights -length_weight_standardized = ( - (length_weights_sex / length_weight_total).unstack(0) - * catch_weights["total_weight"].to_numpy() -) - -# Calculate the specimen weight proportions -# ---- Pivot weight bins -specimen_weight_binned_pvt = ( - specimen_weight_binned.pivot_table( - columns=spatial_column, - index=["length_bin", "species_id", "sex"], - values="weight", - observed 
= False - ) -) -# ---- Divide by the aged stratum weights (relative to only aged fish) -specimen_weight_proportions_pvt = ( - specimen_weight_binned_pvt / specimen_weight_total.to_numpy() -) -# ---- Pivot back to the desired format -specimen_weight_proportion = ( - specimen_weight_proportions_pvt - .stack().reset_index(name="weight_proportion") - .pivot_table(columns=stratum_column + ["species_id", "sex"], - index="length_bin", values="weight_proportion") -) -# ---- Calculate the internal (i.e. only aged fish) for each sex -within_specimen_sex_proportions = ( - specimen_weight_proportion.sum() -) - -# Calculate the total strata weights -# ---- Index `catch_weights` -catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) -# ---- Compute the spatially-stratified/grouped weights -spatial_weights = ( - pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) - .pivot_table( - columns=stratum_column, - aggfunc="sum", - values="total_weight", - observed=False - ) -) - -# Calculate the weight proportions relative to the overall stratum weights -# ---- Aged -# -------- Reformat into dataframe and merge with total stratum weights -specimen_weights_binned_df = ( - specimen_weight_binned_pvt.stack() - .to_frame("specimen_weight") - .reset_index() - .merge(spatial_weights.T.reset_index(), on=stratum_column) -) -# -------- Calculate proportions -specimen_weights_binned_df["weight_proportion_overall"] = ( - specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] -) -# -------- Consolidate to calculate the sexed proportions per stratum -specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column + ["species_id", "sex"])[ - "weight_proportion_overall" -].sum() -# ---- Unaged -# -------- Reformat into dataframe and merge with total stratum weights -length_weights_sex_standardized_df = ( - length_weight_standardized.stack() - .to_frame("catch_weight") - .reset_index() - .merge(spatial_weights.T.reset_index(), on=stratum_column) -) -# -------- Calculate proportions -length_weights_sex_standardized_df["weight_proportion_overall"] = ( - length_weights_sex_standardized_df["catch_weight"] - / length_weights_sex_standardized_df["total_weight"] -) -# -------- Back-calculate the sexed weight proportions relative to just unaged fish -# ------------ Aggregate proportions -length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( - columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" -).transpose().unstack(["species_id"]).sum(axis=0) -# ------------ Re-compute the proportions -length_weight_sex_proportions = ( - length_weights_sex_standardized_df.pivot_table( - index=["species_id", "sex"], columns=stratum_column, - values="weight_proportion_overall" - ) - / length_total_sex_proportions.to_numpy() -) - -# Compute the overall length-binned weight distributions among unaged fish -# ---- Extract the number proportions computed for unaged fish -length_number_proportions = length_number_proportion.copy() -# ---- Filter out values besides those computed for 'all' fish -length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] -# ---- Convert to a table -length_number_proportions_tbl = length_number_proportions.pivot_table( - columns=stratum_column + ["species_id"], - index=["length_bin"], - values="proportion_number_length", - aggfunc="sum", - observed=False, -) -# ---- Extract the fitted weight values calculated for all fish 
-length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] -# ---- Generate the fitted weight array -fitted_weights = length_weight_all.copy() -# ---- Get actual length bins in dataset -fitted_weights = fitted_weights[fitted_weights["length_bin"].isin(length_number_proportions["length_bin"])] -# ---- Apportion the averaged weights -length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"].to_numpy() -# ---- Compute the average weight proportions per length bin per stratum -average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights.sum(axis=1) -# ---- Convert back to a DataFrame -average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( - name="weight_proportion" -) - -# Calculate the aged and unaged weight proportions -# ---- Aged -aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) -# ---- Unaged -unaged_proportions = 1 - aged_proportions -# -------- Re-weight the unaged sexed proportions -unaged_weight_sex_proportions_overall = ( - (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float).fillna(0.0) -) - -unaged_proportions.unstack().transpose() -# Format the outputs -# ---- Aged: stratum-sex-age-length relative to aged and total weights -aged_overall_df = ( - specimen_weight_proportion.unstack() - .reset_index(name="weight_proportions") - .merge( - specimen_weights_binned_df[ - stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] - ] - ) -) -# ---- Aged: stratum-sex relative to total weights -aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( - stratum_column + ["species_id", "sex"] - ) -# ---- Add the aged sex proportiosn relative to the overall survey -aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions -# ---- Consolidate the aged and unaged sexed dataframes -# -------- Initialize the dataframe -aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] + stratum_column) -# --------- Add the within-unaged weight proportions -aged_unaged_sex_proportions["weight_proportion_unaged"] = ( - length_weight_sex_proportions.stack() -) -# --------- Add the overall-unaged weight proportions -aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( - unaged_weight_sex_proportions_overall.stack() -) -# ---- Overall aged and unaged proportions -aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") -# ---- Set index -aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) -# -------- Add unaged proportions -aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() -# ---- Reset the index -aged_unaged_proportions = aged_unaged_proportions.reset_index() -#################################################################################################### -# * Functionality for reading in processed acoustic data -# TODO: Expand data validator and limit cases to '*.zarr' (for now) -# TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
-# TODO: Documentation -file_settings = file_configuration["input_directories"]["acoustics"] -root_directory = file_configuration["data_root_dir"] - - -#################################################################################################### -def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, List[str]]] = None): - - # Get all database files - database_files = file_configuration["database"] - - # Iterate through all keys - for _, db_file in database_files.items(): - # ---- Map the table names - table_names = SQL(db_file, "map") - # ---- Drop any noted exceptions - if not isinstance(table_exception, list): - table_exception = [table_exception] - # ---- Drop exception table name - if None not in table_exception: - table_names = list(set(table_names) - set(table_exception)) - _ = [SQL(db_file, "drop", table_name=table) for table in table_names] - # ---- Validate that all tables were removed - if set(table_names).intersection(set(SQL(table_names, "map"))): - raise ValueError( - f"Attempted reset of [{str(db_file)}] failed." - ) - -SPATIAL_CONFIG_MAP = { - "closest_haul": { - "proximity": { - "choices": ["distance", "time"], - }, - }, - "global" : {}, - "griddify": { - "bounds": { - "longitude": { - "types": [float] - }, - "latitude": { - "types": [float] - }, - "northings": { - "types": [float] - }, - "eastings": { - "types": [float] - }, - "pairs": [("longitude", "latitude"), ("northings", "eastings")], - }, - "grid_resolution": { - "x_distance": { - "types": float, - }, - "y_distance": { - "types": float, - }, - "d_longitude": { - "types": float, - }, - "d_latitude": { - "types": float, - }, - "grid_size_x": { - "types": int, - }, - "grid_size_y": { - "types": int, - }, - "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), - ("grid_size_x", "grid_size_y")], - }, - }, - "inpfc": { - "stratum_names": { - "types": [int, str] - }, - "latitude_max": { - "types": [float], - }, - }, - "weighted_haul": { - "proximity": { - "choices": ["distance", "time"] - }, - }, -} - - - -reset_db_files(file_configuration, table_exception = "files_read") -reset_db_files(file_configuration) - -stamp = 20240714194248 -stamp.astype(int) -int(stamp) -import re -from datetime import datetime - -def infer_datetime_format(timestamp_str: Union[int, str]): - patterns = { - r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS - r"^\d{8}$": "%Y%m%d", # YYYYMMDD - r"^\d{6}$": "%H%M%S", # HHMMSS - r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS - r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS - r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD - r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD - } - - for pattern, date_format in patterns.items(): - if re.match(pattern, timestamp_str): - return date_format - - raise ValueError("Unknown timestamp format") - -filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) - -def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): - - # Create dataframe copy - data_copy = biology_data.copy() - - # Iterate through dictionary to apply filters (if present) - for column, value in filter_dict.items(): - if column in data_copy.columns: - data_copy = data_copy[data_copy[column] == value] - - # Return output - return data_copy - - - -df[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] -df[(df["species_id"] == 17 if "species_id" in df.columns)] - -(df[df["haul_num"] == 17 if "haul_num" in df.columns] else 
True) - - -from datetime import datetime - -df = biology_output["trawl_info_df"] -df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] -df.index - -biology_output["trawl_info_df"].reset_index().index -df = biology_output["catch_df"] -df = df.loc[0, :].to_frame().T -df.index -df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] - -def convert_datetime(timestamp: Union[int, str, pd.Series]): - - if isinstance(timestamp, pd.Series): - test_timestamp = str(timestamp[0]) - else: - test_timestamp = str(timestamp) - - # Approximate the datetime format - datetime_format = infer_datetime_format(str(test_timestamp)) - - # - if isinstance(timestamp, pd.Series): - return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) - else: - return datetime.strptime(timestamp, datetime_format) - -infer_datetime_format(stamp) -convert_datetime(stamp) -infer_datetime_format(202407) - -# {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} -file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" -file_configuration["geospatial"] -spatial_config = file_configuration["geospatial"] -############### - -acoustic_data = self.input["acoustics"] -biology_data = self.input["biology"] - - - -from echopop.live.live_core import SPATIAL_CONFIG_MAP - -def load_spatial_data(acoustic_data: dict, - biology_data: dict, - file_configuration: dict,): - - # Extract spatial strata *only* if spatial information from the configuration settings - # ---- Get (geo)spatial config - spatial_config = file_configuration["geospatial"] - # ---- Remove case sensitivity - spatial_config = {key.lower(): value for key, value in spatial_config.items()} - # ---- Extract the projection - projection = spatial_config["projection"] - # ---- Extract the biology-acoustics linking method options - acoustics_biology_link = spatial_config["link_biology_acoustics"] - - # Validate the configuration - validate_spatial_config(spatial_config) - - # Create spatial dictionary that will be added as an `input` - spatial_dict = {"link_method": acoustics_biology_link} - - # Assign the spatial link constraints to the acoustic and biological data - if acoustics_biology_link == "INPFC": - spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) - - # Return the dictionary as an output - return spatial_dict - - - - # Convert the DataFrame to a GeoDataFrame - acoustic_data_gdf = gpd.GeoDataFrame( - data=acoustic_data, - geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), - crs=projection - ) - - # Validate the spatial biology-acoustics linking method - # ---- Get the biology-acoustics linking method - link_method = next(key for key, value in acoustics_biology_link.items() if value) - # ---- Flag Error if unexpected method - if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
- ) - -#################################################################################################### -# TEST: BIOLOGY FILE INGESTION CONFIGURATION -# NOTE: -# ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` -biology_data, file_configuration = load_biology_data(file_configuration) -biology_data -#################################################################################################### -prc_nasc_df = acoustic_data["prc_nasc_df"] - -def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, - echometrics: bool = True): - - # Integrate NASC (and compute the echometrics, if necessary) - nasc_data_df = ( - acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) - .apply(lambda group: integrate_nasc(group, echometrics)) - .reset_index() - ) - # ---- Amend the dtypes if echometrics were computed - if echometrics: - nasc_data_df = ( - nasc_data_df - .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, - "center_of_mass": float, "dispersion": float, "evenness": float, - "aggregation": float, "occupied_area": float}) - ) - - # Get the name of the associated db file - acoustics_db = file_configuration["database"]["acoustics"] - # ---- Get current tables - tables = SQL(acoustics_db, "inspect") - - # - if "nasc_df" not in tables: - _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) - else: - # ---- - nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") - # ---- - index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) - # ---- - bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) - # ---- - _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df.loc[~bool_idx]) - # ---- - nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) - - # Return the output - return nasc_data_df - - -SQL(acoustics_db, command="drop", table_name="nasc_df") -SQL(acoustics_db, "inspect") - -nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) - -SQL(acoustics_db, command="select", table_name="nasc_df") - -TS_SLOPE = 20.0 -TS_INTERCEPT = -68.0 - -# CONVERT TO TS -comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT -# TO SIGMA_BS -comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) -# WEIGHTED MEAN SIGMA_BS -sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) - -from typing import Optional -from echopop.utils import operations -from echopop.acoustics import ts_length_regression, to_linear, to_dB - -__all__ = ["operations"] - -# Meld bio datasets -length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], - contrasts=["haul_num", "sex", "species_id", "length"]) - -# Create distribution -distrib_params = file_configuration["biology"]["length_distribution"]["bins"] - -length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], distrib_params)}, dtype=float) -binwidth = np.diff(length_bins / 2.0).mean() -intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) -length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) -# -length_datasets["length_bin"] = pd.cut(length_datasets["length"], bins=intervals, labels=length_bins_df["bin"]) - -stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] - -if stratify_key == "global": - length_distribution = ( - 
length_datasets.pivot_table(columns=["sex"], index=["length_bin"], - values="length_count", aggfunc="sum", observed=False) - ) - # - length_distribution["total"] = length_distribution.sum(axis=1) - -length_distribution.transpose() -SQL(biology_db, "drop", table_name="length_distribution") -# Get the name of the associated db file -biology_db = file_configuration["database"]["biology"] -# ---- Get current tables -tables = SQL(biology_db, "inspect") - - -if "length_distribution" not in tables: - _ = SQL(biology_db, "insert", table_name="length_distribution", - dataframe=length_distribution.transpose()) - - -SQL(biology_db, "select", table_name="length_distribution") -SQL(biology_db, "drop", table_name="length_distribution") -SQL(biology_db, "replace", table_name="length_distribution", dataframe=length_distribution.unstack().reset_index(name="count")) -length_distribution.unstack().reset_index(name="count") -mixed = SQL(biology_db, "select", table_name="length_distribution") -length_bins[:1] -from typing import Optional -from echopop.utils import operations -from echopop.acoustics import ts_length_regression, to_linear, to_dB - -__all__ = ["operations"] - -biology_data = self.input["biology"] - -# Meld bio datasets -length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], - contrasts=["haul_num", "species_id", "length"]) - -ts_length_parameters_spp = [ - spp - for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() - if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) -] - -# ---- get species info -target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) - -ts_lengths_df = length_datasets.merge( - target_species.drop("length_units", axis=1), - left_on=["species_id"], - right_on=["number_code"], -) -# ---- filter out other spp -length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] - +# import contextlib +# import copy +# import glob +# import os +# import re +# from datetime import datetime +# from pathlib import Path +# from typing import Optional, Tuple, Union + +# import geopandas as gpd +# import matplotlib.cm as cm +# import matplotlib.colors as colors +# import matplotlib.pyplot as plt +# import numpy as np +# import pandas as pd +# import shapely.geometry +# import xarray as xr +# import yaml +# from geopy.distance import distance +# from matplotlib.colors import ListedColormap +# from shapely import wkt +# from shapely.geometry import box +# from sqlalchemy import Engine, create_engine, inspect, text + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.live import live_data_loading as eldl, live_data_processing as eldp +# from echopop.live.live_acoustics import configure_transmit_frequency, integrate_nasc +# from echopop.live.live_biology import preprocess_biology_data +# from echopop.live.live_core import ( +# LIVE_DATA_STRUCTURE, +# LIVE_FILE_FORMAT_MAP, +# LIVE_INPUT_FILE_CONFIG_MAP, +# SPATIAL_CONFIG_MAP, +# ) +# from echopop.live.live_data_loading import validate_data_directory +# from echopop.live.live_data_processing import get_unique_identifiers, query_dataset +# from echopop.live.live_survey import LiveSurvey +# from echopop.live.sql_methods import ( +# SQL, +# SQL_COMMANDS, +# format_sql_columns, +# initialize_database, +# query_processed_files, +# sql_data_exchange, +# sql_group_update, +# sql_update_strata_summary, +# ) +# from echopop.spatial.projection import utm_string_generator +# from echopop.survey import Survey + +# 
self = realtime_survey +# spatial_config = self.config["geospatial"] +# dataset = self.input["acoustics"]["nasc_df"] + + +# survey_2019 = Survey("C:/Users/Brandyn/Documents/GitHub/echopop/config_files/initialization +# _config.yml", "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/survey_year_2019_config +# .yml") +# survey_2019.transect_analysis() +# survey_2019.analysis["transect"]["biology"]["weight"]["weight_stratum_df"] +# analysis_dict = survey_2019.analysis["transect"] +# SQL(acoustic_db, "select", table_name="sigma_bs_mean_df") +# proportions_dict=analysis_dict["biology"]["proportions"]["number"] +# length_weight_dict = analysis_dict["biology"]["weight"] +# stratum_proportions_sexed["proportion_aged"] + stratum_proportions_sexed["proportion_unaged"] + +# updated_survey_data = nasc_biology.copy() +# gridding_column = file_configuration["gridding_column"] + +# unique_keys = get_unique_identifiers(updated_survey_data, gridding_column) + + +# file_configuration = self.config +# grid_settings["grid_resolution"]["x"] = 50 +# grid_settings["grid_resolution"]["y"] = 50 +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +# self = realtime_survey +# file_configuration = self.config + +# def initialize_grid(): + +# # Get root directory, if defined +# if "data_root_dir" in file_configuration: +# root_dir = Path(file_configuration["data_root_dir"]) +# else: +# root_dir = Path() + +# # Get `grid` settings +# grid_database = file_configuration["input_directories"]["grid"]["database_name"] + +# # Create full filepath +# db_filepath = root_dir / "database" / grid_database + +# # Create if file doesn't already exist +# if not db_filepath.exists(): + +# # Get projection +# projection = file_configuration["geospatial"]["projection"] + +# # Get grid settings +# grid_settings = file_configuration["geospatial"]["griddify"] + +# # Get the resolution +# resolution = grid_settings["grid_resolution"] +# # ---- Convert from nmi to m +# resolution_m = {key: distance(nautical=dist).meters for key, dist in resolution.items()} + +# # Get boundary coordinates +# boundary = grid_settings["bounds"] +# # ---- x +# x = boundary["longitude"] +# # ---- y +# y = boundary["latitude"] +# # ---- Create DataFrame +# boundary_df = pd.DataFrame({ +# "x": np.array([np.min(x), np.max(x), np.max(x), np.min(x), np.min(x)]), +# "y": np.array([np.min(y), np.min(y), np.max(y), np.max(y), np.min(y)]) +# }) + +# # Create GeoDataFrame +# boundary_gdf = gpd.GeoDataFrame( +# data = boundary_df, +# geometry=gpd.points_from_xy(boundary_df["x"], boundary_df["y"]), +# crs = projection +# ) + +# # Convert to UTM (decimal degrees to m) +# # ---- Create UTM code +# utm_code = utm_string_generator((boundary_df.x.min() + boundary_df.x.max()) / 2, +# (boundary_df.y.min() + boundary_df.y.max()) / 2) +# # ---- Create number code +# utm_num = int(utm_code) +# # ---- Create string code +# utm_str = f"epsg:{utm_num}" +# # ---- UTM conversion +# boundary_gdf_utm = boundary_gdf.to_crs(utm_num) + +# # Get step sizes for each grid cell +# # ---- x +# x_step = resolution_m["x_distance"] +# # ---- y +# y_step = resolution_m["y_distance"] + +# # Prepare grid cell generation +# # ---- Get new boundaries +# xmin, ymin, xmax, ymax = boundary_gdf_utm.total_bounds +# # ---- Initialize empty list +# grid_cells = [] +# # ---- Initialize coordinate counter +# y_ct = 0 +# x_coord = []; y_coord = [] +# # ---- Iterate through to generate cells +# for y0 in 
np.arange(ymin, ymax, y_step): +# y_ct += 1 +# x_ct = 0 +# for x0 in np.arange(xmin, xmax, x_step): +# x_ct += 1 +# # ---- Step forward +# x_coord.append(x_ct) +# y_coord.append(y_ct) +# x1 = x0 - x_step +# y1 = y0 + y_step +# # ---- Append to list +# grid_cells.append(shapely.geometry.box(x0, y0, x1, y1)) + +# # Convert to a GeoDataFrame +# cells_gdf = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs=utm_code) +# # ---- Add coordinates +# cells_gdf.loc[:, "x"] = np.array(x_coord) +# cells_gdf.loc[:, "y"] = np.array(y_coord) + +# # Get coastline shapefile directory, if defined +# if "coastline" in file_configuration["input_directories"]: + +# # Get coastline settings +# coast_settings = file_configuration["input_directories"]["coastline"] +# # ---- Create filepath +# shp_filepath = ( +# root_dir / coast_settings["directory"] +# / coast_settings["coastline_name"] / f"{coast_settings["coastline_name"]}.shp" +# ) +# # ---- Validate existence +# if not shp_filepath.exists(): +# raise FileNotFoundError( +# f"{shp_filepath} does not exist!" +# ) + +# # Get original lat/lon geometry boundaries +# xmin0, ymin0, xmax0, ymax0 = boundary_gdf.total_bounds + +# # Read in file +# full_coast = gpd.read_file(shp_filepath) +# # ---- Convert to UTM +# full_coast_utm = full_coast.to_crs(utm_code) +# # ---- Remove empty +# full_coast_utm = full_coast_utm[~full_coast_utm.is_empty] + +# # Create bounding box with a buffer +# boundary_box = box(xmin0 - 5, ymin0 - 5, xmax0 + 5, ymax0 + 5) +# # ---- Create an unbuffered copy +# boundary_box_unbuffered = box(xmin0, ymin0, xmax0, ymax0) +# # ---- Convert to a GeoDataFrame +# boundary_box_unbuffered_gdf = ( +# gpd.GeoDataFrame(geometry=[boundary_box_unbuffered], crs=projection) +# ) +# # ---- Clip the coastline for saving +# clipped_coast_original = ( +# gpd.clip(full_coast, box(xmin0 + 1, ymin0 + 1, xmax0 + 1, ymax0 + 1)) +# ) + +# # Clip the coastline shapefile +# clipped_coast = gpd.clip(full_coast, boundary_box).to_crs(utm_code) + +# # Clip the grid cells +# cells_gdf.loc[:, "geometry"] = ( +# cells_gdf["geometry"].difference(clipped_coast.geometry.union_all()) +# ) + +# # Calculate area per cell +# cells_gdf.loc[:, "area"] = cells_gdf.area + +# # Convert back to original projection and clip +# clipped_cells_latlon = ( +# gpd.clip(cells_gdf.to_crs(projection), boundary_box_unbuffered_gdf) +# .reset_index(drop=True) +# ) + +# # Initialize empty columns that can be added to later on +# clipped_cells_latlon.loc[:, ["number_density_mean", "biomass_density_mean", +# "abundance", "biomass"]] = 0.0 + +# # Create output DataFrame +# output_df = pd.DataFrame({ +# "geometry": clipped_cells_latlon["geometry"].apply(lambda geom: geom.wkt) +# }) +# # ---- Add the required columns +# output_df = pd.concat([output_df, clipped_cells_latlon.loc[:, ["x", "y", "area"]]], +# axis=1) +# # ---- Initialize empty columns that can be added to later on +# output_df.loc[:, ["number_density_mean", "biomass_density_mean", "abundance", +# "biomass"]] = 0.0 + +# # Write to the database file (for the grid) +# # ---- Create engine +# engine = sqla.create_engine(f"sqlite:///{db_filepath}") +# # ---- Connect and create table +# _ = output_df.to_sql("grid_df", engine, if_exists="replace") + +# # Write to the database file (for the coastline shapefile) +# # ---- Create output copy +# coastline_out = pd.DataFrame({ +# "geometry": clipped_coast_original["geometry"].apply(lambda geom: geom.wkt) +# }) +# # ---- Concatenate +# coastline_out = ( +# pd.concat([coastline_out, 
clipped_coast_original.drop(columns="geometry")], +# axis=1) +# ) +# # ---- Connect and create table +# _ = coastline_out.to_sql("coastline_df", engine, if_exists="replace") + +# ################################################################################################## +# # TEST: YAML FILE CONFIGURATION +# # ---- Define filepaths +# self = LiveSurvey +# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initia +# lization_config.yml" +# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_ +# year_2019_config.yml" +# # ---- Run function: `live_configuration` +# file_configuration = self.config +# files = biology_files + +# biology_output = initial_biology_output +# file_configuration = self.config +# table_name = "length_df" +# df = filtered_biology_output[table_name] +# database_file = biology_db +# kwargs = dict(dataframe=df, table_name=table_name, id_columns=["id"], primary_keys=["id"], +# output_type=pd.DataFrame) + +# # NOTE: ARGUMENT: {working_dataset: Literal["acoustics", "biology"]} +# working_dataset = "acoustics" +# self = realtime_survey +# file_configuration = self.config +# self.results["biology"] = self.input["biology_processed"] +# self.results["acoustics"] = self.input["nasc_df"] + +# # Get spatial column +# spatial_column = file_configuration["spatial_column"] + +# # Initialize the working data dictionary +# working_data = copy.deepcopy(self.results) +# contrast_columns = [] +# # ---- Define unique columns +# unique_columns = spatial_column + contrast_columns + +# acoustic_db = file_configuration["database"][working_dataset] +# self = realtime_survey +# acoustic_dict = self.input["acoustics"] +# verbose = True +# contrast_columns = [] +# db_file = acoustic_db +# table_name="survey_data_df" +# data_columns = data_columns +# unique_columns=unique_columns +# constraint="nasc > 0.0" +# data_dict = self.input["acoustics"] +# data_dict["nasc_df"]["stratum"] = 1 +# data_dict["prc_nasc_df"]["stratum"] = 2 +# table_name = "sigma_bs_mean_df" +# data_columns=["sigma_bs", "sigma_bs_count"] +# biology_db +# strata_df = self.input["spatial"]["strata"] + +# def biology_pipeline(biology_dict: dict, +# strata_df: pd.DataFrame, +# file_configuration: dict, +# verbose: bool, +# contrast_columns: List[str] = []): + +# # Get spatial column +# spatial_column = file_configuration["spatial_column"] +# unique_columns = spatial_column + contrast_columns + +# # Get database file +# acoustic_db = file_configuration["database"]["acoustics"] + +# # Get biology database file +# biology_db = file_configuration["database"]["biology"] + +# # Check for data completion +# # ---- List of boolean values +# full_biology_data = ( +# [True for _, df in biology_dict.items() if isinstance(df, pd.DataFrame) and df is # -file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] - -def average_sigma_bs(length: Union[pd.DataFrame, float, int], - TS_L_slope: Optional[float] = None, - TS_L_intercept: Optional[float] = None, - weighted: Optional[Union[float, int, str]] = None): - - # - if isinstance(length, pd.DataFrame): - if "length" not in length.columns: - raise ValueError( - "Column [`length`] missing from dataframe input `length`." - ) - if "TS_L_slope" not in length.columns and TS_L_slope is None: - raise ValueError( - "Value [`TS_L_slope`] missing from dataframe input `length` and optional " - "separate argument `TS_L_slope`." 
- ) - if "TS_L_intercept" not in length.columns and TS_L_intercept is None: - raise ValueError( - "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " - "separate argument `TS_L_intercept`." - ) - elif isinstance(length, float) or isinstance(length, int): - if TS_L_slope is None: - raise ValueError( - "Argument [`TS_L_slope`] missing." - ) - elif TS_L_slope is not None and not isinstance(TS_L_slope, float): - raise TypeError( - "Argument `TS_L_slope` must be type `float`." - ) - if "TS_L_intercept" not in length.columns and TS_L_intercept is None: - raise ValueError( - "Argument [`TS_L_intercept`] missing." - ) - elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): - raise TypeError( - "Argument `TS_L_intercept` must be type `float`." - ) - - # - if TS_L_slope is None: - TS_L_slope = length["TS_L_slope"] - - # - if TS_L_intercept is None: - TS_L_intercept = length["TS_L_intercept"] - - # - if isinstance(length, pd.DataFrame): - length_val = length["length"] - - ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) - sigma_bs_value = to_linear(ts_value) - - - - if isinstance(weighted, str): - if weighted not in length.columns: - raise ValueError( - f"Argument [`weighted` (str)], '{weighted}', is not a column in argument `length` " - f"(DataFrame)." - ) - else: - return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() - elif weighted is not None: - if weighted.size != sigma_bs_value.size: - raise ValueError( - f"Argument [`weighted` (float|int)] of size {weighted.size} does not match size of " - f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." - ) - else: - return (sigma_bs_value * weighted).sum() / weighted.sum() - else: - return sigma_bs_value.mean() - -def parse_condition(condition): - # Handle nested conditions and logical operators - condition = condition.replace('&', ' AND ').replace('|', ' OR ') - - # Handle "IN" lists and replace square brackets with parentheses - condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", condition, flags=re.IGNORECASE) - - # Handle range conditions for BETWEEN, including floats - condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', - lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) - - # Handle individual comparisons - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} {m.group(2)} {m.group(3)}", condition) - - # Handle single equal sign - condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", condition) - - # Remove redundant spaces - condition = re.sub(r'\s+', ' ', condition).strip() - - return condition - -#################################################################################################### -def load_spatial_data(file_configuration: dict, - acoustic_data: pd.DataFrame, - coordinate_metadata: xr.Dataset): - - # Extract spatial strata *only* if spatial information from the configuration settings - # ---- Extract the projection - projection = file_configuration["geospatial"]["projection"] - # ---- Extract the biology-acoustics linking method options - acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] - - # Convert the DataFrame to a GeoDataFrame - acoustic_data_gdf = gpd.GeoDataFrame( - data=acoustic_data, - 
geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), - crs=projection - ) - - # Validate the spatial biology-acoustics linking method - # ---- Get the biology-acoustics linking method - link_method = next(key for key, value in acoustics_biology_link.items() if value) - # ---- Flag Error if unexpected method - if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: - raise ValueError( - f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " - f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." - ) - - # Create INPFC stratum dataframe - # ---- Extract - - # Validate projection information - # ---- Create a dummy GeoDataFrame to extract CRS information - # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) - # ---- Extract coordinate limits from the acoustic data - # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] - # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] - # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] - # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] - # # ---- Create boundary box string - # boundary_box_str = ( - # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " - # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" - # ) - - # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{utm_string_generator(lon_min, lat_min)}") - # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") - - # from pyproj import CRS - # from pyproj.aoi import AreaOfInterest - # from pyproj.database import query_utm_crs_info - - # utm_crs_list = query_utm_crs_info( - # datum_name="WGS 84", - # area_of_interest=AreaOfInterest( - # west_lon_degree=lon_min, - # south_lat_degree=lat_min, - # east_lon_degree=-lon_max, - # north_lat_degree=lat_max, - # ), - # ) - # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") - -#################################################################################################### -def live_data(file_configuration: dict): - - # Extract the file directories (or from the configuration) containing acoustic, biological, and - # spatial definitions/data/parameters - # ---- Acoustic data - acoustic_data = load_validated_acoustic_data(file_configuration) - # ---- Biological data - # ---- Spatial data - - - -#################################################################################################### -# * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) -# TODO: Update structure with additional information (as needed) -# TODO: Documentation -LIVE_DATA_STRUCTURE = { - "meta": { - "provenance": dict(), - "date": list(), - }, - "input": { - "acoustics": { - "nasc_df": pd.DataFrame(), - }, - "biology": { - "catch_df": pd.DataFrame(), - "distributions": { - "length_bins_df": pd.DataFrame(), - }, - "length_df": pd.DataFrame(), - "specimen_df": pd.DataFrame(), - }, - }, - "results": { - "acoustics": dict(), - "biology": dict(), - "stratified": dict(), - }, -} -#################################################################################################### -# * Define `LiveSurvey` class structure -# TODO: Incorporate validators -# TODO: Scope out full structure including accessors, attributes, and methods -# TODO: Configure input arguments (for initialization) -# TODO: Documentation 
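+# NOTE (editorial sketch, not part of the original patch): the `LiveSurvey` class below
+# deep-copies `LIVE_DATA_STRUCTURE` rather than assigning the template directly. A minimal,
+# self-contained example of why that matters (shared nested dicts vs. independent copies);
+# the `template`, `survey_a`, and `survey_b` names are illustrative only:
+# import copy
+#
+# import pandas as pd
+#
+# template = {"results": {"acoustics": dict(), "biology": dict(), "stratified": dict()}}
+# survey_a = copy.deepcopy(template["results"])
+# survey_b = copy.deepcopy(template["results"])
+# # Mutating one instance's results should not leak into the template or other instances
+# survey_a["acoustics"]["nasc_df"] = pd.DataFrame({"nasc": [1.0]})
+# assert "nasc_df" not in survey_b["acoustics"]
+# assert "nasc_df" not in template["results"]["acoustics"]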
-class LiveSurvey: - """ - A real-time processing version of the `echopop` base `Survey` class that ingests biological, - acoustic, and event meta data to provide population estimates when generated. - """ - - def __init__( - self, - live_init_config_path: Union[str, Path], - live_file_config_path: Union[str, Path], - ): - # Initialize `meta` attribute - self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) - - # Loading the configuration settings and definitions that are used for defining the - # configuration settings - self.config = live_configuration(live_file_config_path, live_file_config_path) - - # Loading the datasets defined in the configuration files - self.input = el.load_survey_data(self.config) - - # Initialize the `results` data attribute - self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) - -current_units = zarr_data_ds["frequency_nominal"].units -acoustic_analysis_settings["transmit"] -file_configuration - -specimen_df = pd.DataFrame( - { - "haul_num": np.repeat([1,2,3], 4), - "station": "specimen", - "sex": np.tile(["male", "female"], 6), - "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), - "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, - }, -) - -length_df = pd.DataFrame( - { - "haul_num": np.repeat([1,2,3], 4), - "station": "length", - "sex": np.tile(["male", "female"], 6), - "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), - "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), - }, -) - -catch_df = pd.DataFrame( - { - "haul_num": np.array([1, 2, 3]), - "weight": np.array([503.12, 684.32, 978.54]) - } -) - -TS_SLOPE = 20.0 -TS_INTERCEPT = -68.0 - -acoustic_db = realtime_survey.config["database"]["acoustics"] -SQL(acoustic_db, "select", table_name="files_processed") -biology_db = realtime_survey.config["database"]["biology"] -SQL(biology_db, "select", table_name="files_processedk") -#### -# CONCATENATE FILE SOURCES -specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].value_counts().to_frame("length_count").reset_index() -specimen_reframed -# MELD -all_lengths = pd.concat([length_df, specimen_reframed]) -# COMBINE -comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_frame("length_count").reset_index() - - -from echopop.live.sql_methods import SQL - -# Assuming that you have a LiveSurvey object defined -# ---- Get the database file name (and path) -biology_db = livesurvey_object.config["database"]["biology"] -# ---- -# CONVERT TO TS -comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT -# TO SIGMA_BS -comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) -# WEIGHTED MEAN SIGMA_BS -sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) - -# INTEGRATE NASC -path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" - -Path(path2file).exists() -xds = xr.open_dataset(path2file, engine="zarr") -xds -xdf = xds.to_dataframe().reset_index() -xdf["NASC"] = xdf["NASC"].fillna(0.0) -# convert frequency -xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) -# filter -xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] - -xdf_38.plot.scatter(x="distance", y="depth", c="NASC") -plt.show() - -xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() - -plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) -plt.plot(xdf_int["longitude"], 
xdf_int["latitude"]) -plt.show() - -# CONVERT TO NUMBER DENSITY -xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) - - -################### -from geopy.distance import distance -from shapely.geometry import Polygon, Point, box -import geopandas as gpd -from shapely.ops import unary_union -import pyproj - - -grid_settings = file_configuration["geospatial"]["griddify"] -grid = [] -lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters -lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters -lat_min = grid_settings["bounds"]["latitude"][0] -lat_max = grid_settings["bounds"]["latitude"][1] -lon_min = grid_settings["bounds"]["longitude"][0] -lon_max = grid_settings["bounds"]["longitude"][1] - -utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) -utm_proj = pyproj.Proj(f"epsg:{utm_str}") -x_min, y_min = utm_proj(lon_min, lat_min) -x_max, y_max = utm_proj(lon_max, lat_max) - -lat = 55.5000 -lon = -134.2500 -utm_code = int(utm_string_generator(lon, lat)) -utm_proj = pyproj.Proj(f"epsg:{utm_code}") -utm_proj(lon, lat) -gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=projection).to_crs(utm_code) - - -num_lon_steps = int((x_max - x_min) / lon_step) -num_lat_steps = int((y_max - y_min) / lat_step) - -lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) -lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) -lon2 = lon1 + lon_step -lat2 = lat1 + lat_step - -# Convert UTM coordinates back to degrees -lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) -lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) - -# Convert UTM coordinates back to degrees with adjusted resolution -lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) -lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) - -polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lon2_deg, lat2_deg)] -grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") - -world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ne_110m_land/ne_110m_land.shp") -bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) -shapefile = world -clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) -clipped_shapefile.to_crs(utm_proj.srs) -# clipped_geometry = bbox.intersection(world.union_all()) -# clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) - -from shapely.geometry import MultiPolygon -# Create an empty list to store clipped geometries -# clipped_geometries = [] - -# # Iterate over each grid polygon -# for index, row in grid_gdf.iterrows(): -# # Intersect grid polygon with land shape -# intersection = row['geometry'].intersection(clipped_shapefile.unary_union) - -# # If intersection is a MultiPolygon, get the difference with the land shape -# if isinstance(intersection, MultiPolygon): -# clipped = row['geometry'].difference(clipped_shapefile.unary_union) -# if clipped.is_empty: -# continue -# clipped_geometries.append(clipped) +# not None] +# ) +# # ---- Validation +# if not all(full_biology_data): +# # ---- Print, if verbose +# if verbose: +# print( +# f"No new processed biology data available for processing." 
+# ) +# else: +# # Get related biology data +# acoustic_df = get_nasc_sql_data(acoustic_db, +# biology_dict, +# unique_columns=unique_columns) + +# # Get the corresopding `sigma_bs` data (and also compute the s +# ample-number weighted average) +# sigma_bs_df = get_sigma_bs_sql_data(acoustic_db, +# biology_dict, +# unique_columns=unique_columns) + +# # Calculate population estimates if valid data are available +# if all([True if df is not None else False for df in [acoustic_df, sigma_bs_df]]): +# # ---- Merge the NASC and sigma_bs datasets +# nasc_biology = acoustic_df.merge(sigma_bs_df, on=unique_columns) +# # ---- Compute the number densities (animals nmi^-2) +# nasc_biology["number_density"] = ( +# nasc_biology["nasc"] +# / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +# ) + +# # Get the corresponding average strata weights (computed for all fish) +# weight_spatial_averages = get_average_strata_weights(biology_db, +# biology_dict, +# unique_columns=unique_columns) + +# if weight_spatial_averages is not None: +# # Merge average weights with number density estimates +# nasc_biology = nasc_biology.merge(weight_spatial_averages, on=unique_columns) + +# # Compute biomass densities +# nasc_biology["biomass_density"] = ( +# nasc_biology["number_density"] * nasc_biology["average_weight"] +# ) + +# # Update the survey population estimate DataFrame with the newly computed densities +# if not nasc_biology.empty: +# sql_group_update(acoustic_db, dataframe=nasc_biology, table_name="survey_data_df", +# columns=["number_density", "biomass_density"], +# unique_columns=["stratum", "longitude", "latitude", "ping_time"]) + +# # Summarize strata +# summarize_strata(nasc_biology, strata_df, file_configuration) + +# db_file=acoustic_db +# dataframe=nasc_biology +# table_name="survey_data_df" +# columns=["number_density", "biomass_density"] +# unique_columns=["stratum", "longitude", "latitude", "ping_time"] +# nasc_biology["number_density"].sum() / 2 +# nasc_biology["number_density"] +# SQL(acoustic_db, "select", table_name="survey_data_df") +# SQL(biology_db, "select", table_name="strata_summary_df") +# strata_df = self.input["spatial"]["strata"].copy() +# strata_df[["length_mean", "weight_mean", "TS_mean", "number_density_mean", +# "biomass_density_mean", "abundance_sum", "biomass_sum"]] = np.nan +# strata_df.drop(columns=["latitude_interval"], inplace=True) +# SQL(acoustic_db, "select", table_name="survey_data_df") + +# SQL(biology_db, "drop", table_name="strata_summary_df") +# SQL(biology_db, "create", table_name="strata_summary_df", dataframe=strata_df, +# primary_keys=["stratum"]) +# SQL(biology_db, "insert", table_name="strata_summary_df", dataframe=strata_df, +# id_columns=["stratum"]) + +# tt = pd.DataFrame({ +# "x": np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]), +# "y": np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), +# "area": 50 ** 2, +# "mean_number_density": 0.0, +# "mean_biomass_density": 0.0, +# "abundance": 0.0, +# "biomass": 0.0 +# }) + +# nasc_biology_output_a = self.input["nasc_df"].assign(x=1, y=1).reset_index(drop=True) +# nasc_biology_output_a.loc[3, "x"] = 2 +# nasc_biology_output_a.loc[3, "y"] = 3 +# nasc_biology_output_a = nasc_biology_output_a.filter(["stratum", "x", "y", "longitude", +# "latitude", "nasc", "number_density", "biomass_density"]) +# nasc_biology_output = nasc_biology_output_a.merge(sigma_bs_mean_df, on=spatial_column) +# nasc_biology_output["number_density"] = ( +# nasc_biology_output["nasc"] +# / (4.0 * np.pi * nasc_biology_output["sigma_bs_mean"]) +# ) +# nasc_biology_output 
=nasc_biology_output.merge(general_weight_averages) +# nasc_biology_output["biomass_density"] = nasc_biology_output["number_density"] +# * nasc_biology_output["average_weight"] +# nasc_biology_output = nasc_biology_output.filter(["stratum", "x", "y", "longitude", "latitude" +# , "number_density", "biomass_density"]) +# nasc_biology_output = nasc_biology_output[nasc_biology_output["number_density"] > 0.0] +# .reset_index() + +# SQL(acoustic_db, "drop", table_name="reference") +# SQL(acoustic_db, "drop", table_name="grid") + +# SQL(acoustic_db, "create", table_name = "reference", dataframe=tt) +# SQL(acoustic_db, "create", table_name = "grid", dataframe=nasc_biology_output_a) + +# SQL(acoustic_db, "insert", table_name = "reference", dataframe=tt) +# SQL(acoustic_db, "insert", table_name = "grid", dataframe=nasc_biology_output_a) + +# SQL(acoustic_db, "select", table_name="grid") +# SQL(acoustic_db, "select", table_name="reference") + +# sql_group_update(acoustic_db, dataframe=nasc_biology_output, +# table_name="grid", columns=["number_density", "biomass_density"], +# unique_columns=["stratum", "x", "y", "longitude", "latitude"]) + +# SQL(acoustic_db, "select", table_name="grid") + +# from typing import List + +# data_table = "grid" +# grid_table = "reference" +# column_pairs = [("number_density", "abundance"), ("biomass_density", "biomass")] + +# dataframe = nasc_biology_output + +# import sqlalchemy as sqla + +# grid_db_file = file_configuration["database"]["grid"] +# survey_db_file = Path(file_configuration["data_root_dir"]) / "database" / "acoustics.db" +# data_table = "survey_data_df" +# grid_table = "grid_df" +# coordinates = ["x", "y"] +# from echopop.live.sql_methods import SQL + +# SQL(grid_db_file, "select", table_name=grid_table) +# SQL(survey_db_file, "select", table_name=data_table) +# SQL(data_table, "map") + +# gridding_column = self.config["gridding_column"] + +# updated_survey_data = nasc_biology.copy() +# # Get relevant table +# previous_grid = query_dataset(grid_db_file, updated_survey_data, +# table_name=grid_table, +# data_columns=["x", "y", "area", "number_density_mean", +# "biomass_density_mean", "abundance", "biomass"], +# unique_columns=["x", "y"]) +# previous_data = query_dataset(survey_db_file, updated_survey_data, +# table_name=data_table, +# data_columns=["x", "y", "number_density", "biomass_density"], +# unique_columns=["x", "y"]) +# # Get unique coordinates +# update_keys = get_unique_identifiers(updated_survey_data, gridding_column).set_index(["x", "y"]) + + +# # Index +# previous_grid.set_index(["x", "y"], inplace=True) +# previous_grid["biomass_density_mean"] = previous_data.groupby(["x", "y"])["biomass_density"] +# .mean() +# previous_grid["number_density_mean"] = previous_data.groupby(["x", "y"])["number_density"].mean() + +# # Convert area from m^2 to nmi^2 +# previous_grid["abundance"] = previous_grid["number_density_mean"] * previous_grid["area"] +# previous_grid["biomass"] = previous_grid["biomass_density_mean"] * previous_grid["area"] +# previous_grid = previous_grid.reset_index() + +# sql_group_update(grid_db_file, dataframe=previous_grid, +# table_name=grid_table, +# columns=["number_density_mean", "biomass_density_mean", "abundance", "biomass"], +# unique_columns=["x", "y"]) + +# myrrh = SQL(grid_db_file, "select", table_name=grid_table) +# myrrh[myrrh.abundance > 0] + +# update_keys["number_density_mean"] = updated_survey_data.groupby(["x", "y"]) +# ["number_density"].mean() +# update_keys["biomass_density_mean"] = updated_survey_data.groupby(["x", 
"y"]) +# ["biomass_density"].mean() + +# am = SQL(grid_db_file, "select", table_name="grid_df") +# am[am.abundance > 0] +# bm = SQL(grid_db_file, "select", table_name="grid_df") +# bm[bm.abundance > 0] +# number_density_mean = updated_survey_data.groupby(["x", "y"])["number_density"].mean() +# biomass_density_mean = updated_survey_data.groupby(["x", "y"])["biomass_density"].mean() + +# SQL(grid_db_file, "select", table_name=grid_table) + + +# pulled_data = pd.concat([SQL(grid_db_file, "select", +# table_name=grid_table, +# condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) +# previous_cell_data = pd.concat([SQL(survey_db_file, "select", +# table_name=data_table, +# condition=f"x = {t[0]} & y = {t[1]}") for t in unique_coord]) + +# from typing import List + +# from shapely.geometry import box + +# from echopop.live.live_data_processing import ( +# get_average_strata_weights, +# get_nasc_sql_data, +# get_sigma_bs_sql_data, +# summarize_strata, +# ) +# from echopop.live.sql_methods import sql_group_update + +# SQL(grid_db_file, "select", table_name="grid_df") +# # Compute means +# number_density_mean = previous_cell_data.groupby(["x", "y"])["number_density"].mean() +# previous_cell_data = previous_cell_data.groupby(["x", "y"])["biomass_density"].mean() + +# [SQL(grid_db_file, "select", table_name=grid_table, condition=f"x = +# {xi} & y = {yi}") for xi, yi in zip(nasc_data_df["x"], nasc_data_df["y"])] + +# # Write to the database file (for the grid) +# # ---- Create engine +# engine = sqla.create_engine(f"sqlite:///{db_filepath}") + +# def update_population_grid(grid_db_file: str, +# data_table: str, +# grid_table: str, +# dataframe: pd.DataFrame, +# column_pairs: Union[List[tuple[str, str]], tuple[str, str]], +# coordinates: List[str]): + +# # Convert `column_pairs` to a list, if needed +# if not isinstance(column_pairs, list): +# column_pairs = [column_pairs] + +# dataframe[coordinates] +# # Format the coordinate pairs +# # ---- Convert coordinate values into a list of tuples +# coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] +# # ---- Get unique pairs +# coords = list(set(coord_pairs)) + +# # Format the SQL script command +# # ---- Initialize +# sql_script = [] +# # ---- Iteratively update +# for input_column, output_column in column_pairs: +# sql_script.append( +# f""" +# BEGIN TRANSACTION; + +# -- Calculate averages for input_column and update grid_table +# WITH avgs AS ( +# SELECT +# {coordinates[0]}, +# {coordinates[1]}, +# AVG(d.{input_column}) as avg_value +# FROM {data_table} d +# GROUP BY d.{coordinates[0]}, d.{coordinates[1]} +# ) + +# -- Update the grid_table with both average and computed total +# UPDATE {grid_table} +# SET +# mean_{input_column} = ( +# SELECT avg_value +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ), +# {output_column} = ( +# SELECT avg_value * {grid_table}.area +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ) +# WHERE EXISTS ( +# SELECT 1 +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ); + +# COMMIT; +# """ +# ) + +# # Create the engine +# engine = create_engine(f"sqlite:///{db_file}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = 
dbapi_conn.executescript("\n".join(sql_script)) + + +# def update_population_grid(db_file: str, +# data_table: str, +# grid_table: str, +# dataframe: pd.DataFrame, +# column_pairs: Union[List[tuple[str, str]], tuple[str, str]], +# coordinates: List[str]): + +# # Convert `column_pairs` to a list, if needed +# if not isinstance(column_pairs, list): +# column_pairs = [column_pairs] + +# dataframe[coordinates] +# # Format the coordinate pairs +# # ---- Convert coordinate values into a list of tuples +# coord_pairs = [tuple(row) for row in dataframe[coordinates].itertuples(index=False)] +# # ---- Get unique pairs +# coords = list(set(coord_pairs)) + +# # Format the SQL script command +# # ---- Initialize +# sql_script = [] +# # ---- Iteratively update +# for input_column, output_column in column_pairs: +# sql_script.append( +# f""" +# BEGIN TRANSACTION; + +# -- Calculate averages for input_column and update grid_table +# WITH avgs AS ( +# SELECT +# {coordinates[0]}, +# {coordinates[1]}, +# AVG(d.{input_column}) as avg_value +# FROM {data_table} d +# GROUP BY d.{coordinates[0]}, d.{coordinates[1]} +# ) + +# -- Update the grid_table with both average and computed total +# UPDATE {grid_table} +# SET +# mean_{input_column} = ( +# SELECT avg_value +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ), +# {output_column} = ( +# SELECT avg_value * {grid_table}.area +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ) +# WHERE EXISTS ( +# SELECT 1 +# FROM avgs +# WHERE avgs.{coordinates[0]} = {grid_table}.{coordinates[0]} +# AND avgs.{coordinates[1]} = {grid_table}.{coordinates[1]} +# ); + +# COMMIT; +# """ +# ) + +# # Create the engine +# engine = create_engine(f"sqlite:///{db_file}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = dbapi_conn.executescript("\n".join(sql_script)) + + +# SQL(acoustic_db, "select", table_name=data_table) +# SQL(acoustic_db, "select", table_name=grid_table) + + +# SQL(acoustic_db, "update", table_name="grid", dataframe=nasc_biology_output, +# unique_columns=["stratum", "x", "y"], columns=["number_density", "biomass_density"]) +# SQL(acoustic_db, "select", table_name="reference") + +# source_db = acoustic_db +# target_db = biology_db + +# source_table = "grid" +# target_table = "strata_summary_df" + +# data_columns = ["number_density", "biomass_density"] +# strata_columns = ["stratum"] +# strata = [2] +# stratum_list = ', '.join(map(str, stratum_values)) + +# data_column = data_columns[0] +# data_columns = data_columns[0] +# def sql_update_strata_summary(source_db: str, +# target_db: str, +# arg_fun: str, +# data_columns: List[tuple[str, str]], +# strata: list): + +# # Format strata list as a string +# strata_str = ', '.join(map(str, strata)) + +# # Function reference map +# FUNCTION_MAP = { +# "sum": {"function": "SUM", +# "suffix": "sum"}, +# "mean": {"function": "AVG", +# "suffix": "mean"} +# } + +# # Prepare the SQL script +# sql_script = f""" +# -- Attach the source and target databases +# ATTACH DATABASE '{source_db}' AS source; +# ATTACH DATABASE '{target_db}' AS target; + +# """ + +# # Dynamically format the cross-database command +# for data_column, method in data_columns: +# # ----- Format the function-method-suffic keys +# suffix = FUNCTION_MAP[method]["suffix"] +# fun = 
FUNCTION_MAP[method]["function"] +# # ---- Create the combined SQL command using f-strings +# sql_script += f""" +# -- Calculate averages and directly update the target table +# UPDATE target.{target_table} +# SET {data_column}_{suffix} = ( +# SELECT {fun}({data_column}) +# FROM source.{source_table} +# WHERE stratum = target.{target_table}.stratum +# ) +# WHERE stratum IN ({strata_str}); +# """ +# # ----- Append DETACH commands only once at the end +# sql_script += """ +# -- Detach the databases +# DETACH DATABASE source; +# DETACH DATABASE target; +# """ + +# # Create the engine +# engine = create_engine(f"sqlite:///{target_db}") + +# # Create the SQL database connection and send the script +# with engine.connect() as connection: +# dbapi_conn = connection.connection +# _ = dbapi_conn.executescript(sql_script) + +# SQL(biology_db, "select", table_name=target_table) +# SQL(acoustic_db, "select", table_name=source_table)["number_density"].mean() +# connection.close() +# dbapi_conn.close() + + +# pairs = [(1, 2), (3, 4), (5, 6)] + +# # Convert the pairs into a format suitable for SQL IN clause +# pairs_placeholder = ', '.join(f'({x}, {y})' for x, y in pairs) + +# # Construct the SQL command as a text string +# sql_command = f''' +# BEGIN TRANSACTION; + +# UPDATE reference +# SET total = ( +# SELECT AVG(g.sigma_bs) * r.area +# FROM grid g +# WHERE g.stratum = r.stratum_x +# ) +# WHERE (stratum_x, stratum_y) IN ({pairs_placeholder}); + +# COMMIT; +# ''' + +# psi = 10 ** (-21/10) +# psi * 280**2 * 1500 * 128e-6 / 2 +# psi / 3 * 280 ** 3 / 280 / 1852 ** 2 * nasc_biology["number_density"] + +# psi * (280.0 ** 2) / 1852 ** 2 +# depth_area = 280 ** 2 * psi +# swath_length = 0.5 * 1852 +# depth_area * swath_length / 1852 ** 2 * nasc_biology["number_density"] +# 280 ** 2 * psi / 1852 ** 2 * nasc_biology["number_density"] + +# SQL(acoustic_db, "map") +# beam_angle = 9.0 * np.pi / 180.0 +# 280.0 * np.tan(beam_angle) * 2.0 * swath_length / 1852 ** 2 * nasc_biology["number_density"] +# 280.0 * np.tan(beam_angle) * 2.0 ** 2 * np.pi * swath_length / 1852 ** 2 * +# nasc_biology["number_density"] +# area = 2.0 * nasc_biology["center_of_mass"] ** 2 * np.tan(beam_angle) +# area / 1852 ** 2 * nasc_biology["number_density"] +# SQL(acoustic_db, "map") + +# # Merge hake fraction data into `nasc_interval_df` +# # ---- Initial merge +# nasc_interval_df = nasc_interval_df.merge( +# input_dict["spatial"]["strata_df"], on=[stratum_col, "haul_num"], how="outer" +# ) +# # ---- Replace `fraction_hake` where NaN occurs +# nasc_interval_df["fraction_hake"] = nasc_interval_df["fraction_hake"].fillna(0.0) +# # ---- Drop NaN +# nasc_interval_df.dropna(subset=["transect_num"], inplace=True) + +# # Calculate the along-transect number density (animals per nmi^2) +# # ---- Merge NASC measurements with mean sigma_bs for each stratum +# nasc_biology = nasc_interval_df.merge(sigma_bs_strata, on=[stratum_col]) +# # ---- Calculate the number densities +# nasc_biology["number_density"] = ( +# nasc_biology["fraction_hake"] +# * nasc_biology["nasc"] +# / (4.0 * np.pi * nasc_biology["sigma_bs_mean"]) +# ) + + +# if working_dataset == "acoustic": +# db_file = self.config["database"]["acoustic"] +# elif working_dataset == "biology": +# db_file = self.config["database"]["biology"] +# else: +# raise ValueError( +# f"Argument for `working_dataset` [{working_dataset}] is invalid." +# f" Value must either be 'acoustic' or 'biology'." 
+# ) + +# # Extract the necessary correct strata mean sigma_bs +# sigma_bs_strata = analysis_dict["acoustics"]["sigma_bs"]["strata_mean_df"] + +# # Pull out the length-weight conversion for each stratum +# length_weight_strata = analysis_dict["biology"]["weight"]["weight_stratum_df"] + +# # Get the name of the stratum column +# stratum_col = settings_dict["transect"]["stratum_name"] + + +# catch_data = self.input["biology"]["catch_df"] + +# # Get the spatial column name, if there is one +# spatial_column = file_configuration["spatial_column"] +# # ---- Append additional columns that will be used +# contrast_columns = spatial_column + ["sex", "species_id"] + +# # Calculate grouped totals +# # ---- Sum the net haul weights from station 1/unaged fish +# catch_weights = catch_data.count_variable( +# contrasts=["species_id"] + spatial_column, +# variable="haul_weight", fun="sum" +# ) +# # ---- Rename resulting columns for both +# catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# # ---- Specimen +# specimen_weights = specimen_weight_binned.sum().reset_index(name="total_weight") + +# specimen_weight_binned +# # Calculate the sexed and total stratum weights for each sex among unaged fish +# # ---- Sum the net haul weights from station 1/unaged fish +# catch_weights = catch_data.count_variable( +# contrasts=["species_id"] + file_configuration["spatial_column"], +# variable="haul_weight", fun="sum" +# ) +# # ---- Rename resulting columns for both +# catch_weights.rename(columns={"count": "total_weight"}, inplace=True) + +# # For the specimen data +# # ---- Sum the net haul weights from station 1/unaged fish +# # ---- Specimen +# specimen_weights_sex = ( +# specimen_weight_binned +# .groupby(contrast_columns)["weight"] +# .sum() +# ) +# # ---- Total (per stratum, if it exists) +# specimen_weight_total = specimen_weights_sex.transpose().unstack(1).sum(axis=1) + +# # For the length (unaged) dataset +# length_weights_sex = ( +# length_weight_binned +# .groupby(contrast_columns)["weight_interp"] +# .sum() +# ) +# # ---- Further reduce to the grand total (per stratum, if it exists) +# length_weight_total = length_weights_sex.transpose().unstack(1).sum(axis=1) + +# # ---- Standardize the unaged sexed weights +# length_weight_standardized = ( +# (length_weights_sex / length_weight_total).unstack(0) +# * catch_weights["total_weight"].to_numpy() +# ) + +# # Calculate the specimen weight proportions +# # ---- Pivot weight bins +# specimen_weight_binned_pvt = ( +# specimen_weight_binned.pivot_table( +# columns=spatial_column, +# index=["length_bin", "species_id", "sex"], +# values="weight", +# observed = False +# ) +# ) +# # ---- Divide by the aged stratum weights (relative to only aged fish) +# specimen_weight_proportions_pvt = ( +# specimen_weight_binned_pvt / specimen_weight_total.to_numpy() +# ) +# # ---- Pivot back to the desired format +# specimen_weight_proportion = ( +# specimen_weight_proportions_pvt +# .stack().reset_index(name="weight_proportion") +# .pivot_table(columns=stratum_column + ["species_id", "sex"], +# index="length_bin", values="weight_proportion") +# ) +# # ---- Calculate the internal (i.e. 
only aged fish) for each sex +# within_specimen_sex_proportions = ( +# specimen_weight_proportion.sum() +# ) + +# # Calculate the total strata weights +# # ---- Index `catch_weights` +# catch_weights_idx = catch_weights.set_index(stratum_column + ["species_id"]) +# # ---- Compute the spatially-stratified/grouped weights +# spatial_weights = ( +# pd.concat([specimen_weight_total.to_frame("total_weight"), catch_weights_idx]) +# .pivot_table( +# columns=stratum_column, +# aggfunc="sum", +# values="total_weight", +# observed=False +# ) +# ) + +# # Calculate the weight proportions relative to the overall stratum weights +# # ---- Aged +# # -------- Reformat into dataframe and merge with total stratum weights +# specimen_weights_binned_df = ( +# specimen_weight_binned_pvt.stack() +# .to_frame("specimen_weight") +# .reset_index() +# .merge(spatial_weights.T.reset_index(), on=stratum_column) +# ) +# # -------- Calculate proportions +# specimen_weights_binned_df["weight_proportion_overall"] = ( +# specimen_weights_binned_df["specimen_weight"] / specimen_weights_binned_df["total_weight"] +# ) +# # -------- Consolidate to calculate the sexed proportions per stratum +# specimen_weight_sex_proportions = specimen_weights_binned_df.groupby(stratum_column +# + ["species_id", "sex"])[ +# "weight_proportion_overall" +# ].sum() +# # ---- Unaged +# # -------- Reformat into dataframe and merge with total stratum weights +# length_weights_sex_standardized_df = ( +# length_weight_standardized.stack() +# .to_frame("catch_weight") +# .reset_index() +# .merge(spatial_weights.T.reset_index(), on=stratum_column) +# ) +# # -------- Calculate proportions +# length_weights_sex_standardized_df["weight_proportion_overall"] = ( +# length_weights_sex_standardized_df["catch_weight"] +# / length_weights_sex_standardized_df["total_weight"] +# ) +# # -------- Back-calculate the sexed weight proportions relative to just unaged fish +# # ------------ Aggregate proportions +# length_total_sex_proportions = length_weights_sex_standardized_df.pivot_table( +# columns=["species_id", "sex"], index=stratum_column, values="weight_proportion_overall" +# ).transpose().unstack(["species_id"]).sum(axis=0) +# # ------------ Re-compute the proportions +# length_weight_sex_proportions = ( +# length_weights_sex_standardized_df.pivot_table( +# index=["species_id", "sex"], columns=stratum_column, +# values="weight_proportion_overall" +# ) +# / length_total_sex_proportions.to_numpy() +# ) + +# # Compute the overall length-binned weight distributions among unaged fish +# # ---- Extract the number proportions computed for unaged fish +# length_number_proportions = length_number_proportion.copy() +# # ---- Filter out values besides those computed for 'all' fish +# length_number_proportions = length_number_proportions[length_number_proportions["sex"] == "all"] +# # ---- Convert to a table +# length_number_proportions_tbl = length_number_proportions.pivot_table( +# columns=stratum_column + ["species_id"], +# index=["length_bin"], +# values="proportion_number_length", +# aggfunc="sum", +# observed=False, +# ) +# # ---- Extract the fitted weight values calculated for all fish +# length_weight_all = length_weight_df[length_weight_df["sex"] == "all"] +# # ---- Generate the fitted weight array +# fitted_weights = length_weight_all.copy() +# # ---- Get actual length bins in dataset +# fitted_weights = fitted_weights[fitted_weights["length_bin"]. 
+# isin(length_number_proportions["length_bin"])] +# # ---- Apportion the averaged weights +# length_apportioned_weights = length_number_proportions_tbl.T * fitted_weights["weight_fitted"] +# .to_numpy() +# # ---- Compute the average weight proportions per length bin per stratum +# average_length_bin_weights = length_apportioned_weights.T / length_apportioned_weights +# .sum(axis=1) +# # ---- Convert back to a DataFrame +# average_length_bin_weights_df = average_length_bin_weights.unstack().reset_index( +# name="weight_proportion" +# ) + +# # Calculate the aged and unaged weight proportions +# # ---- Aged +# aged_proportions = specimen_weight_sex_proportions.unstack("sex").sum(axis=1) +# # ---- Unaged +# unaged_proportions = 1 - aged_proportions +# # -------- Re-weight the unaged sexed proportions +# unaged_weight_sex_proportions_overall = ( +# (length_weight_sex_proportions * unaged_proportions.unstack().transpose()).astype(float). +# fillna(0.0) +# ) + +# unaged_proportions.unstack().transpose() +# # Format the outputs +# # ---- Aged: stratum-sex-age-length relative to aged and total weights +# aged_overall_df = ( +# specimen_weight_proportion.unstack() +# .reset_index(name="weight_proportions") +# .merge( +# specimen_weights_binned_df[ +# stratum_column + ["length_bin", "sex", "species_id", "weight_proportion_overall"] +# ] +# ) +# ) +# # ---- Aged: stratum-sex relative to total weights +# aged_sex_df =within_specimen_sex_proportions.reset_index(name="weight_proportion_aged").set_index( +# stratum_column + ["species_id", "sex"] +# ) +# # ---- Add the aged sex proportiosn relative to the overall survey +# aged_sex_df["weight_proportion_overall_aged"] = specimen_weight_sex_proportions +# # ---- Consolidate the aged and unaged sexed dataframes +# # -------- Initialize the dataframe +# aged_unaged_sex_proportions = aged_sex_df.reset_index().set_index(["species_id", "sex"] +# + stratum_column) +# # --------- Add the within-unaged weight proportions +# aged_unaged_sex_proportions["weight_proportion_unaged"] = ( +# length_weight_sex_proportions.stack() +# ) +# # --------- Add the overall-unaged weight proportions +# aged_unaged_sex_proportions["weight_proportion_overall_unaged"] = ( +# unaged_weight_sex_proportions_overall.stack() +# ) +# # ---- Overall aged and unaged proportions +# aged_unaged_proportions = aged_proportions.reset_index(name="aged_proportions") +# # ---- Set index +# aged_unaged_proportions.set_index(stratum_column + ["species_id"], inplace=True) +# # -------- Add unaged proportions +# aged_unaged_proportions["unaged_proportions"] = unaged_proportions#.reset_index() +# # ---- Reset the index +# aged_unaged_proportions = aged_unaged_proportions.reset_index() +# ################################################################################################## +# # * Functionality for reading in processed acoustic data +# # TODO: Expand data validator and limit cases to '*.zarr' (for now) +# # TODO: Refactor "extra" components such as the validation steps, xarray-to-dataframe piping, etc. 
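+# NOTE (editorial sketch, not part of the original patch): a compact, self-contained
+# illustration of the density conversions used repeatedly in this scratch script --
+# NASC is scaled to an areal number density by the stratum-mean backscattering
+# cross-section (sigma_bs), then to a biomass density via the stratum-average weight.
+# The numeric values below are placeholders; column names mirror the tables used above.
+# import numpy as np
+# import pandas as pd
+#
+# nasc_biology = pd.DataFrame({
+#     "stratum": [1, 2],
+#     "nasc": [1500.0, 800.0],            # NASC (m^2 nmi^-2)
+#     "sigma_bs_mean": [2.0e-5, 1.5e-5],  # mean sigma_bs (m^2)
+#     "average_weight": [0.45, 0.52],     # stratum-average weight (kg)
+# })
+# # Number density (animals nmi^-2): NASC / (4 * pi * sigma_bs)
+# nasc_biology["number_density"] = (
+#     nasc_biology["nasc"] / (4.0 * np.pi * nasc_biology["sigma_bs_mean"])
+# )
+# # Biomass density (kg nmi^-2): number density scaled by the average weight
+# nasc_biology["biomass_density"] = (
+#     nasc_biology["number_density"] * nasc_biology["average_weight"]
+# )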
+# # TODO: Documentation +# file_settings = file_configuration["input_directories"]["acoustics"] +# root_directory = file_configuration["data_root_dir"] + + +# ################################################################################################## +# def reset_db_files(file_configuration: dict, table_exception: Optional[Union[str, +# List[str]]] = None): + +# # Get all database files +# database_files = file_configuration["database"] + +# # Iterate through all keys +# for _, db_file in database_files.items(): +# # ---- Map the table names +# table_names = SQL(db_file, "map") +# # ---- Drop any noted exceptions +# if not isinstance(table_exception, list): +# table_exception = [table_exception] +# # ---- Drop exception table name +# if None not in table_exception: +# table_names = list(set(table_names) - set(table_exception)) +# _ = [SQL(db_file, "drop", table_name=table) for table in table_names] +# # ---- Validate that all tables were removed +# if set(table_names).intersection(set(SQL(table_names, "map"))): +# raise ValueError( +# f"Attempted reset of [{str(db_file)}] failed." +# ) + +# SPATIAL_CONFIG_MAP = { +# "closest_haul": { +# "proximity": { +# "choices": ["distance", "time"], +# }, +# }, +# "global" : {}, +# "griddify": { +# "bounds": { +# "longitude": { +# "types": [float] +# }, +# "latitude": { +# "types": [float] +# }, +# "northings": { +# "types": [float] +# }, +# "eastings": { +# "types": [float] +# }, +# "pairs": [("longitude", "latitude"), ("northings", "eastings")], +# }, +# "grid_resolution": { +# "x_distance": { +# "types": float, +# }, +# "y_distance": { +# "types": float, +# }, +# "d_longitude": { +# "types": float, +# }, +# "d_latitude": { +# "types": float, +# }, +# "grid_size_x": { +# "types": int, +# }, +# "grid_size_y": { +# "types": int, +# }, +# "pairs": [("x_distance", "y_distance"), ("d_longitude", "d_latitude"), +# ("grid_size_x", "grid_size_y")], +# }, +# }, +# "inpfc": { +# "stratum_names": { +# "types": [int, str] +# }, +# "latitude_max": { +# "types": [float], +# }, +# }, +# "weighted_haul": { +# "proximity": { +# "choices": ["distance", "time"] +# }, +# }, +# } + + +# reset_db_files(file_configuration, table_exception = "files_read") +# reset_db_files(file_configuration) + +# stamp = 20240714194248 +# stamp.astype(int) +# int(stamp) +# import re +# from datetime import datetime + + +# def infer_datetime_format(timestamp_str: Union[int, str]): +# patterns = { +# r"^\d{14}$": "%Y%m%d%H%M%S", # YYYYMMDDHHMMSS +# r"^\d{8}$": "%Y%m%d", # YYYYMMDD +# r"^\d{6}$": "%H%M%S", # HHMMSS +# r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$": "%Y-%m-%d %H:%M:%S", # YYYY-MM-DD HH:MM:SS +# r"^\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}$": "%Y/%m/%d %H:%M:%S", # YYYY/MM/DD HH:MM:SS +# r"^\d{4}-\d{2}-\d{2}$": "%Y-%m-%d", # YYYY-MM-DD +# r"^\d{4}/\d{2}/\d{2}$": "%Y/%m/%d" # YYYY/MM/DD +# } + +# for pattern, date_format in patterns.items(): +# if re.match(pattern, timestamp_str): +# return date_format + +# raise ValueError("Unknown timestamp format") + +# filter_dict = dict(species_filer=species_filter, trawl_filter=trawl_filter) + +# def biology_data_filter(biology_data: pd.DataFrame, filter_dict: dict): + +# # Create dataframe copy +# data_copy = biology_data.copy() + +# # Iterate through dictionary to apply filters (if present) +# for column, value in filter_dict.items(): +# if column in data_copy.columns: +# data_copy = data_copy[data_copy[column] == value] + +# # Return output +# return data_copy + + +# df[(df['species_id'] == species_filter if 'species_id' in df.columns else 
True)] +# df[(df["species_id"] == 17 if "species_id" in df.columns)] + +# (df[df["haul_num"] == 17 if "haul_num" in df.columns] else True) + + +# from datetime import datetime + +# df = biology_output["trawl_info_df"] +# df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True), :] +# df.index + +# biology_output["trawl_info_df"].reset_index().index +# df = biology_output["catch_df"] +# df = df.loc[0, :].to_frame().T +# df.index +# df.loc[(df['species_id'] == species_filter if 'species_id' in df.columns else True)] + +# def convert_datetime(timestamp: Union[int, str, pd.Series]): + +# if isinstance(timestamp, pd.Series): +# test_timestamp = str(timestamp[0]) +# else: +# test_timestamp = str(timestamp) + +# # Approximate the datetime format +# datetime_format = infer_datetime_format(str(test_timestamp)) + +# # +# if isinstance(timestamp, pd.Series): +# return timestamp.apply(lambda x: datetime.strptime(x, datetime_format)) +# else: +# return datetime.strptime(timestamp, datetime_format) + +# infer_datetime_format(stamp) +# convert_datetime(stamp) +# infer_datetime_format(202407) + +# # {'global': False, 'INPFC': True, 'closest_haul': False, 'weighted_haul': False} +# file_configuration["geospatial"]["link_biology_acoustics"] = "INPFC" +# file_configuration["geospatial"] +# spatial_config = file_configuration["geospatial"] +# ############### + +# acoustic_data = self.input["acoustics"] +# biology_data = self.input["biology"] + + +# from echopop.live.live_core import SPATIAL_CONFIG_MAP + + +# def load_spatial_data(acoustic_data: dict, +# biology_data: dict, +# file_configuration: dict,): + +# # Extract spatial strata *only* if spatial information from the configuration settings +# # ---- Get (geo)spatial config +# spatial_config = file_configuration["geospatial"] +# # ---- Remove case sensitivity +# spatial_config = {key.lower(): value for key, value in spatial_config.items()} +# # ---- Extract the projection +# projection = spatial_config["projection"] +# # ---- Extract the biology-acoustics linking method options +# acoustics_biology_link = spatial_config["link_biology_acoustics"] + +# # Validate the configuration +# validate_spatial_config(spatial_config) + +# # Create spatial dictionary that will be added as an `input` +# spatial_dict = {"link_method": acoustics_biology_link} + +# # Assign the spatial link constraints to the acoustic and biological data +# if acoustics_biology_link == "INPFC": +# spatial_dict.update({"strata": create_inpfc_strata(spatial_config)}) + +# # Return the dictionary as an output +# return spatial_dict + + +# # Convert the DataFrame to a GeoDataFrame +# acoustic_data_gdf = gpd.GeoDataFrame( +# data=acoustic_data, +# geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), +# crs=projection +# ) + +# # Validate the spatial biology-acoustics linking method +# # ---- Get the biology-acoustics linking method +# link_method = next(key for key, value in acoustics_biology_link.items() if value) +# # ---- Flag Error if unexpected method +# if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: +# raise ValueError( +# f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " +# f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." 
+# ) + +# ################################################################################################## +# # TEST: BIOLOGY FILE INGESTION CONFIGURATION +# # NOTE: +# # ---- Run function: `load_validated_acoustic_data` using previously defined `file_configuration` +# biology_data, file_configuration = load_biology_data(file_configuration) +# biology_data +# ################################################################################################## +# prc_nasc_df = acoustic_data["prc_nasc_df"] + +# def process_acoustic_data(acoustic_data_df: pd.DataFrame, file_configuration: dict, +# echometrics: bool = True): + +# # Integrate NASC (and compute the echometrics, if necessary) +# nasc_data_df = ( +# acoustic_data_df.groupby(["longitude", "latitude", "ping_time"]) +# .apply(lambda group: integrate_nasc(group, echometrics)) +# .reset_index() +# ) +# # ---- Amend the dtypes if echometrics were computed +# if echometrics: +# nasc_data_df = ( +# nasc_data_df +# .astype({"n_layers": int, "mean_Sv": float, "max_Sv": float, "nasc_db": float, +# "center_of_mass": float, "dispersion": float, "evenness": float, +# "aggregation": float, "occupied_area": float}) +# ) + +# # Get the name of the associated db file +# acoustics_db = file_configuration["database"]["acoustics"] +# # ---- Get current tables +# tables = SQL(acoustics_db, "inspect") + +# # +# if "nasc_df" not in tables: +# _ = SQL(acoustics_db, "insert", table_name="nasc_df", dataframe=nasc_data_df) # else: -# # If intersection is a single Polygon, directly add to clipped geometries -# clipped_geometries.append(intersection) +# # ---- +# nasc_sql = SQL(acoustics_db, "select", table_name="nasc_df") +# # ---- +# index_equiv = nasc_data_df[["longitude", "latitude", "ping_time"]].isin(nasc_sql) +# # ---- +# bool_idx = index_equiv.apply(lambda x: np.all(x), axis=1) +# # ---- +# _ = SQL(acoustics_db, "insert", table_name="nasc_df", +# dataframe=nasc_data_df.loc[~bool_idx]) +# # ---- +# nasc_data_df = pd.concat([nasc_sql, nasc_data_df], ignore_index=True) + +# # Return the output +# return nasc_data_df + + +# SQL(acoustics_db, command="drop", table_name="nasc_df") +# SQL(acoustics_db, "inspect") + +# nasc_analysis = process_acoustic_data(acoustic_data["prc_nasc_df"], file_configuration) + +# SQL(acoustics_db, command="select", table_name="nasc_df") + +# TS_SLOPE = 20.0 +# TS_INTERCEPT = -68.0 + +# # CONVERT TO TS +# comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# # TO SIGMA_BS +# comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# # WEIGHTED MEAN SIGMA_BS +# sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +# from typing import Optional + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.utils import operations + +# __all__ = ["operations"] + +# # Meld bio datasets +# length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], +# contrasts=["haul_num", "sex", +# "species_id", "length"]) + +# # Create distribution +# distrib_params = file_configuration["biology"]["length_distribution"]["bins"] + +# length_bins = np.linspace(**{key: value for key, value in zip(["start", "stop", "num"], +# distrib_params)}, dtype=float) +# binwidth = np.diff(length_bins / 2.0).mean() +# intervals = np.concatenate([length_bins[:1] - binwidth, length_bins + binwidth]) +# length_bins_df = pd.DataFrame({"bin": length_bins, "interval": pd.cut(length_bins, intervals)}) +# # +# length_datasets["length_bin"] = 
pd.cut(length_datasets["length"], bins=intervals, +# labels=length_bins_df["bin"]) + +# stratify_key = file_configuration["geospatial"]["link_biology_acoustics"] + +# if stratify_key == "global": +# length_distribution = ( +# length_datasets.pivot_table(columns=["sex"], index=["length_bin"], +# values="length_count", aggfunc="sum", observed=False) +# ) +# # +# length_distribution["total"] = length_distribution.sum(axis=1) + +# length_distribution.transpose() +# SQL(biology_db, "drop", table_name="length_distribution") +# # Get the name of the associated db file +# biology_db = file_configuration["database"]["biology"] +# # ---- Get current tables +# tables = SQL(biology_db, "inspect") + + +# if "length_distribution" not in tables: +# _ = SQL(biology_db, "insert", table_name="length_distribution", +# dataframe=length_distribution.transpose()) + + +# SQL(biology_db, "select", table_name="length_distribution") +# SQL(biology_db, "drop", table_name="length_distribution") +# SQL(biology_db, "replace", table_name="length_distribution", +# dataframe=length_distribution.unstack().reset_index(name="count")) +# length_distribution.unstack().reset_index(name="count") +# mixed = SQL(biology_db, "select", table_name="length_distribution") +# length_bins[:1] +# from typing import Optional + +# from echopop.acoustics import to_dB, to_linear, ts_length_regression +# from echopop.utils import operations + +# __all__ = ["operations"] + +# biology_data = self.input["biology"] + +# # Meld bio datasets +# length_datasets = biology_data["specimen_df"].meld(biology_data["length_df"], +# contrasts=["haul_num", "species_id", "length"]) + +# ts_length_parameters_spp = [ +# spp +# for spp in file_configuration["acoustics"]["TS_length_regression_parameters"].values() +# if spp["number_code"] in np.unique(length_datasets.species_id).astype(int) +# ] + +# # ---- get species info +# target_species = pd.DataFrame.from_dict(ts_length_parameters_spp) + +# ts_lengths_df = length_datasets.merge( +# target_species.drop("length_units", axis=1), +# left_on=["species_id"], +# right_on=["number_code"], +# ) +# # ---- filter out other spp +# length_datasets[length_datasets["species_id"].isin(target_species["number_code"])] + +# # +# file_configuration["acoustics"]["TS_length_regression_parameters"][target_species["text_code"]] + +# def average_sigma_bs(length: Union[pd.DataFrame, float, int], +# TS_L_slope: Optional[float] = None, +# TS_L_intercept: Optional[float] = None, +# weighted: Optional[Union[float, int, str]] = None): + +# # +# if isinstance(length, pd.DataFrame): +# if "length" not in length.columns: +# raise ValueError( +# "Column [`length`] missing from dataframe input `length`." +# ) +# if "TS_L_slope" not in length.columns and TS_L_slope is None: +# raise ValueError( +# "Value [`TS_L_slope`] missing from dataframe input `length` and optional " +# "separate argument `TS_L_slope`." +# ) +# if "TS_L_intercept" not in length.columns and TS_L_intercept is None: +# raise ValueError( +# "Value [`TS_L_intercept`] missing from dataframe input `length` and optional " +# "separate argument `TS_L_intercept`." +# ) +# elif isinstance(length, float) or isinstance(length, int): +# if TS_L_slope is None: +# raise ValueError( +# "Argument [`TS_L_slope`] missing." +# ) +# elif TS_L_slope is not None and not isinstance(TS_L_slope, float): +# raise TypeError( +# "Argument `TS_L_slope` must be type `float`." 
+# ) +# if "TS_L_intercept" not in length.columns and TS_L_intercept is None: +# raise ValueError( +# "Argument [`TS_L_intercept`] missing." +# ) +# elif TS_L_intercept is not None and not isinstance(TS_L_intercept, float): +# raise TypeError( +# "Argument `TS_L_intercept` must be type `float`." +# ) + +# # +# if TS_L_slope is None: +# TS_L_slope = length["TS_L_slope"] + +# # +# if TS_L_intercept is None: +# TS_L_intercept = length["TS_L_intercept"] + +# # +# if isinstance(length, pd.DataFrame): +# length_val = length["length"] + +# ts_value = ts_length_regression(length_val, TS_L_slope, TS_L_intercept) +# sigma_bs_value = to_linear(ts_value) + + +# if isinstance(weighted, str): +# if weighted not in length.columns: +# raise ValueError( +# f"Argument [`weighted` (str)], '{weighted}', is not a column in argument +# `length` " +# f"(DataFrame)." +# ) +# else: +# return (sigma_bs_value * length[weighted]).sum() / length[weighted].sum() +# elif weighted is not None: +# if weighted.size != sigma_bs_value.size: +# raise ValueError( +# f"Argument [`weighted` (float|int)] of size {weighted.size} does not +# match size of " +# f"argument [`length` (float|int)`] of size {sigma_bs_value.size}." +# ) +# else: +# return (sigma_bs_value * weighted).sum() / weighted.sum() +# else: +# return sigma_bs_value.mean() + +# def parse_condition(condition): +# # Handle nested conditions and logical operators +# condition = condition.replace('&', ' AND ').replace('|', ' OR ') + +# # Handle "IN" lists and replace square brackets with parentheses +# condition = re.sub(r'(\w+)\s*IN\s*\[(.*?)\]', lambda m: f"{m.group(1)} IN ({m.group(2)})", +# condition, flags=re.IGNORECASE) + +# # Handle range conditions for BETWEEN, including floats +# condition = re.sub(r'(\d*\.\d+|\d+)\s*<=\s*(\w+)\s*<=\s*(\d*\.\d+|\d+)', +# lambda m: f"{m.group(2)} BETWEEN {m.group(1)} AND {m.group(3)}", condition) + +# # Handle individual comparisons +# condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} +# {m.group(2)} {m.group(3)}", condition) +# condition = re.sub(r'(\w+)\s*([<>!=]+)\s*(\'[^\']*\')', lambda m: f"{m.group(1)} +# {m.group(2)} {m.group(3)}", condition) + +# # Handle single equal sign +# condition = re.sub(r'(\w+)\s*=\s*(\d*\.\d+|\d+)', lambda m: f"{m.group(1)} = {m.group(2)}", +# condition) + +# # Remove redundant spaces +# condition = re.sub(r'\s+', ' ', condition).strip() + +# return condition + +# ################################################################################################## +# def load_spatial_data(file_configuration: dict, +# acoustic_data: pd.DataFrame, +# coordinate_metadata: xr.Dataset): + +# # Extract spatial strata *only* if spatial information from the configuration settings +# # ---- Extract the projection +# projection = file_configuration["geospatial"]["projection"] +# # ---- Extract the biology-acoustics linking method options +# acoustics_biology_link = file_configuration["geospatial"]["link_biology_acoustics"] + +# # Convert the DataFrame to a GeoDataFrame +# acoustic_data_gdf = gpd.GeoDataFrame( +# data=acoustic_data, +# geometry=gpd.points_from_xy(acoustic_data["longitude"], acoustic_data["latitude"]), +# crs=projection +# ) + +# # Validate the spatial biology-acoustics linking method +# # ---- Get the biology-acoustics linking method +# link_method = next(key for key, value in acoustics_biology_link.items() if value) +# # ---- Flag Error if unexpected method +# if link_method not in ["global", "closest_haul", "INPFC", "weighted_haul"]: +# raise ValueError( +# 
f"Unexpected biology-acoustic linking parameter ([{link_method}]). Valid options " +# f"include: 'global', 'closest_haul', 'weighted_haul', and 'INPFC'." +# ) + +# # Create INPFC stratum dataframe +# # ---- Extract + +# # Validate projection information +# # ---- Create a dummy GeoDataFrame to extract CRS information +# # geo_crs = gpd.GeoDataFrame(geometry=[], crs=projection) +# # ---- Extract coordinate limits from the acoustic data +# # lat_min = coordinate_metadata.attrs['geospatial_lat_min'] +# # lat_max = coordinate_metadata.attrs['geospatial_lat_max'] +# # lon_min = coordinate_metadata.attrs['geospatial_lon_min'] +# # lon_max = coordinate_metadata.attrs['geospatial_lon_max'] +# # # ---- Create boundary box string +# # boundary_box_str = ( +# # f"POLYGON(({lon_min} {lat_min}, {lon_max} {lat_min}, {lon_max} {lat_max}, " +# # f"{lon_min} {lat_max}, {lon_min} {lat_min}))" +# # ) + +# # data_gdf = gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy( +# acoustic_data["longitude"], acoustic_data["latitude"]),crs=f"epsg:{ +# utm_string_generator(lon_min, lat_min)}") +# # gpd.GeoDataFrame(acoustic_data, geometry=gpd.points_from_xy(acoustic_data["longitude"], +# acoustic_data["latitude"]),crs=f"epsg:4326").to_crs("epsg:32610") + +# # from pyproj import CRS +# # from pyproj.aoi import AreaOfInterest +# # from pyproj.database import query_utm_crs_info + +# # utm_crs_list = query_utm_crs_info( +# # datum_name="WGS 84", +# # area_of_interest=AreaOfInterest( +# # west_lon_degree=lon_min, +# # south_lat_degree=lat_min, +# # east_lon_degree=-lon_max, +# # north_lat_degree=lat_max, +# # ), +# # ) +# # CRS.from_epsg(utm_crs_list[0].code).to_epsg("+proj=latlon") + +# ################################################################################################## +# def live_data(file_configuration: dict): + +# # Extract the file directories (or from the configuration) containing acoustic, biological,and +# # spatial definitions/data/parameters +# # ---- Acoustic data +# acoustic_data = load_validated_acoustic_data(file_configuration) +# # ---- Biological data +# # ---- Spatial data + + +# ################################################################################################## +# # * Define `LIVE_DATA_STRUCTURE` configuration mapping (this will be in an equivalent `core.py`) +# # TODO: Update structure with additional information (as needed) +# # TODO: Documentation +# LIVE_DATA_STRUCTURE = { +# "meta": { +# "provenance": dict(), +# "date": list(), +# }, +# "input": { +# "acoustics": { +# "nasc_df": pd.DataFrame(), +# }, +# "biology": { +# "catch_df": pd.DataFrame(), +# "distributions": { +# "length_bins_df": pd.DataFrame(), +# }, +# "length_df": pd.DataFrame(), +# "specimen_df": pd.DataFrame(), +# }, +# }, +# "results": { +# "acoustics": dict(), +# "biology": dict(), +# "stratified": dict(), +# }, +# } +# ################################################################################################## +# # * Define `LiveSurvey` class structure +# # TODO: Incorporate validators +# # TODO: Scope out full structure including accessors, attributes, and methods +# # TODO: Configure input arguments (for initialization) +# # TODO: Documentation +# class LiveSurvey: +# """ +# A real-time processing version of the `echopop` base `Survey` class that ingests biological, +# acoustic, and event meta data to provide population estimates when generated. 
+# """ + +# def __init__( +# self, +# live_init_config_path: Union[str, Path], +# live_file_config_path: Union[str, Path], +# ): +# # Initialize `meta` attribute +# self.meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) + +# # Loading the configuration settings and definitions that are used for defining the +# # configuration settings +# self.config = live_configuration(live_file_config_path, live_file_config_path) + +# # Loading the datasets defined in the configuration files +# self.input = el.load_survey_data(self.config) + +# # Initialize the `results` data attribute +# self.results = copy.deepcopy(LIVE_DATA_STRUCTURE["results"]) + +# current_units = zarr_data_ds["frequency_nominal"].units +# acoustic_analysis_settings["transmit"] +# file_configuration + +# specimen_df = pd.DataFrame( +# { +# "haul_num": np.repeat([1,2,3], 4), +# "station": "specimen", +# "sex": np.tile(["male", "female"], 6), +# "length": np.array([11, 11, 11, 18, 21, 23, 13, 11, 19, 25, 18, 9]), +# "weight": np.array([11, 14, 16, 18, 21, 23, 13, 11, 19, 25, 18, 9]) / 3.5, +# }, +# ) + +# length_df = pd.DataFrame( +# { +# "haul_num": np.repeat([1,2,3], 4), +# "station": "length", +# "sex": np.tile(["male", "female"], 6), +# "length": np.array([16, 15, 19, 14, 9, 10, 18, 15, 16, 22, 17, 11]), +# "length_count": np.array([103, 123, 257, 106, 52, 329, 131, 72, 101, 212, 93, 81]), +# }, +# ) + +# catch_df = pd.DataFrame( +# { +# "haul_num": np.array([1, 2, 3]), +# "weight": np.array([503.12, 684.32, 978.54]) +# } +# ) + +# TS_SLOPE = 20.0 +# TS_INTERCEPT = -68.0 + +# acoustic_db = realtime_survey.config["database"]["acoustics"] +# SQL(acoustic_db, "select", table_name="files_processed") +# biology_db = realtime_survey.config["database"]["biology"] +# SQL(biology_db, "select", table_name="files_processedk") +# #### +# # CONCATENATE FILE SOURCES +# specimen_reframed = specimen_df.groupby(["haul_num", "station", "sex", "length"])["length"].val +# ue_counts().to_frame("length_count").reset_index() +# specimen_reframed +# # MELD +# all_lengths = pd.concat([length_df, specimen_reframed]) +# # COMBINE +# comb_lengths = all_lengths.groupby(["haul_num", "sex", "length"])["length_count"].sum().to_fra +# me("length_count").reset_index() + + +# from echopop.live.sql_methods import SQL + +# # Assuming that you have a LiveSurvey object defined +# # ---- Get the database file name (and path) +# biology_db = livesurvey_object.config["database"]["biology"] +# # ---- +# # CONVERT TO TS +# comb_lengths["ts"] = TS_SLOPE * np.log10(comb_lengths["length"]) + TS_INTERCEPT +# # TO SIGMA_BS +# comb_lengths["sigma_bs"] = 10 ** (comb_lengths["ts"] / 10) +# # WEIGHTED MEAN SIGMA_BS +# sigma_mean = np.average(comb_lengths["sigma_bs"], weights=comb_lengths["length_count"]) + +# # INTEGRATE NASC +# path2file = "C:/Users/15052/Downloads/win_1720457505_1720460000_NASC.zarr" + +# Path(path2file).exists() +# xds = xr.open_dataset(path2file, engine="zarr") +# xds +# xdf = xds.to_dataframe().reset_index() +# xdf["NASC"] = xdf["NASC"].fillna(0.0) +# # convert frequency +# xdf["frequency_nominal"] = (xdf["frequency_nominal"] * 1e-3).astype(int) +# # filter +# xdf_38 = xdf[xdf["frequency_nominal"] == nasc_frequency] + +# xdf_38.plot.scatter(x="distance", y="depth", c="NASC") +# plt.show() + +# xdf_int = xdf_38.groupby(["distance", "longitude", "latitude"])["NASC"].sum().reset_index() + +# plt.scatter(xdf_int["longitude"], xdf_int["latitude"], c=xdf_int["NASC"]) +# plt.plot(xdf_int["longitude"], xdf_int["latitude"]) +# plt.show() + +# # CONVERT TO NUMBER DENSITY 
+# xdf_int["number_density"] = xdf_int["NASC"] / (4.0 * np.pi * sigma_mean) + + +# import geopandas as gpd +# import pyproj + +# ################### +# from geopy.distance import distance +# from shapely.geometry import Point, Polygon, box +# from shapely.ops import unary_union + +# grid_settings = file_configuration["geospatial"]["griddify"] +# grid = [] +# lat_step = distance(nautical=grid_settings["grid_resolution"]["x"]).meters +# lon_step = distance(nautical=grid_settings["grid_resolution"]["y"]).meters +# lat_min = grid_settings["bounds"]["latitude"][0] +# lat_max = grid_settings["bounds"]["latitude"][1] +# lon_min = grid_settings["bounds"]["longitude"][0] +# lon_max = grid_settings["bounds"]["longitude"][1] + +# utm_str = utm_string_generator((lon_max + lon_min)/2, (lat_max + lat_min)/2) +# utm_proj = pyproj.Proj(f"epsg:{utm_str}") +# x_min, y_min = utm_proj(lon_min, lat_min) +# x_max, y_max = utm_proj(lon_max, lat_max) + +# lat = 55.5000 +# lon = -134.2500 +# utm_code = int(utm_string_generator(lon, lat)) +# utm_proj = pyproj.Proj(f"epsg:{utm_code}") +# utm_proj(lon, lat) +# gpd.GeoDataFrame(geometry=gpd.points_from_xy(np.array([lon]), np.array([lat])), crs=project +# ion).to_crs(utm_code) + + +# num_lon_steps = int((x_max - x_min) / lon_step) +# num_lat_steps = int((y_max - y_min) / lat_step) + +# lon1 = np.linspace(x_min, x_max - lon_step, num_lon_steps) +# lat1 = np.linspace(y_min, y_max - lat_step, num_lat_steps) +# lon2 = lon1 + lon_step +# lat2 = lat1 + lat_step + +# # Convert UTM coordinates back to degrees +# lon_min_grid, lat_min_grid = np.meshgrid(lon1, lat1) +# lon_max_grid, lat_max_grid = np.meshgrid(lon2, lat2) + +# # Convert UTM coordinates back to degrees with adjusted resolution +# lon1_deg, lat1_deg = utm_proj(lon_min_grid.ravel(), lat_min_grid.ravel(), inverse=True) +# lon2_deg, lat2_deg = utm_proj(lon_max_grid.ravel(), lat_max_grid.ravel(), inverse=True) + + +# polygons = [box(lon1, lat1, lon2, lat2) for lon1, lat1, lon2, lat2 in zip(lon1_deg, lat1_deg, lo +# n2_deg, lat2_deg)] +# grid_gdf = gpd.GeoDataFrame({'geometry': polygons}, crs="epsg:4326") + + +# world = gpd.read_file("C:/Users/15052/Documents/GitHub/echopop_data/live_2019_files/coastline/ +# ne_110m_land/ne_110m_land.shp") +# bbox = box(lon_min - 0.25, lat_min - 0.25, lon_max + 0.25, lat_max + 0.25) +# shapefile = world +# clipped_shapefile = gpd.clip(shapefile, bbox).to_crs(utm_proj.srs) +# clipped_shapefile.to_crs(utm_proj.srs) +# # clipped_geometry = bbox.intersection(world.union_all()) +# # clipped_gdf = gpd.GeoDataFrame(geometry=[clipped_geometry], crs=world.crs) + +# from shapely.geometry import MultiPolygon + +# # Create an empty list to store clipped geometries +# # clipped_geometries = [] + +# # # Iterate over each grid polygon +# # for index, row in grid_gdf.iterrows(): +# # # Intersect grid polygon with land shape +# # intersection = row['geometry'].intersection(clipped_shapefile.unary_union) + +# # # If intersection is a MultiPolygon, get the difference with the land shape +# # if isinstance(intersection, MultiPolygon): +# # clipped = row['geometry'].difference(clipped_shapefile.unary_union) +# # if clipped.is_empty: +# # continue +# # clipped_geometries.append(clipped) +# # else: +# # # If intersection is a single Polygon, directly add to clipped geometries +# # clipped_geometries.append(intersection) -# clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) +# # clipped_grid = gpd.GeoDataFrame(geometry=clipped_geometries, crs=grid_gdf.crs) -clipped_geometries = 
grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile.geometry.union_all()) -clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) -clipped_gdf.to_crs(epsg=32610) +# clipped_geometries = grid_gdf['geometry'].to_crs(utm_proj.srs).difference(clipped_shapefile +# .geometry.union_all()) +# clipped_gdf = gpd.GeoDataFrame(geometry=clipped_geometries) +# clipped_gdf.to_crs(epsg=32610) -invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] -clipped_gdf = clipped_gdf.buffer(0.001) -clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 +# invalid_geometries = clipped_gdf[~clipped_gdf.is_valid] +# clipped_gdf = clipped_gdf.buffer(0.001) +# clipped_gdf['area_sqm'] = clipped_gdf.area / 46300.00000000001**2 -clipped_gdf.area +# clipped_gdf.area -fig, ax = plt.subplots(figsize=(10, 8)) -clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) -plt.tight_layout() -plt.show() +# fig, ax = plt.subplots(figsize=(10, 8)) +# clipped_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# clipped_shapefile.plot(ax=ax, edgecolor='black', linewidth=0.5) +# plt.tight_layout() +# plt.show() -bbox.crs = {"init": "epsg:4326"} -intersection = gpd.overlay(bbox, world, how='intersection') +# bbox.crs = {"init": "epsg:4326"} +# intersection = gpd.overlay(bbox, world, how='intersection') -world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') +# world_cut = gpd.sjoin(world, gpd.GeoDataFrame(geometry=[bbox]), how='inner', op='intersects') -world_cut = world[world.geometry.intersects(bbox)] -world_cut.to_crs("epsg:4326") +# world_cut = world[world.geometry.intersects(bbox)] +# world_cut.to_crs("epsg:4326") -import matplotlib.pyplot as plt -fig, ax = plt.subplots(figsize=(10, 10)) -grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") -world_cut.plot(ax=ax, linewidth=2, color='blue') -plt.show() +# import matplotlib.pyplot as plt -for cell in grid_gdf: +# fig, ax = plt.subplots(figsize=(10, 10)) +# grid_gdf.plot(ax=ax, facecolor="none", edgecolor="black") +# world_cut.plot(ax=ax, linewidth=2, color='blue') +# plt.show() - x, y = cell.exterior.xy # Extract x and y coordinates of the cell - ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch -# Plot coastline -# world.plot(ax=ax, linewidth=2, color='blue') -plt.show() +# for cell in grid_gdf: +# x, y = cell.exterior.xy # Extract x and y coordinates of the cell +# ax.fill(x, y, facecolor='none', edgecolor='black') # Plot the cell as a polygon patch +# # Plot coastline +# # world.plot(ax=ax, linewidth=2, color='blue') +# plt.show() -bbox = (lat_min, lon_min, lat_max, lon_max) -G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) -G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], tags={'natural': ['coastline']}) +# bbox = (lat_min, lon_min, lat_max, lon_max) +# G = ox.graph_from_bbox(bbox[2], bbox[3], bbox[0], bbox[1], network_type='none', simplify=False) +# G = ox.geometries_from_bbox(north=bbox[2], south=bbox[0], east=bbox[3], west=bbox[1], +# tags={'natural': ['coastline']}) -latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) -longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) +# latitudes = range(int(lat_min), int(lat_max) + 1, int(lat_step)) +# longitudes = range(int(lon_min), int(lon_max) + 1, int(lon_step)) -# Initialize `meta` attribute -meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) 
+# # Initialize `meta` attribute +# meta = copy.deepcopy(LIVE_DATA_STRUCTURE["meta"]) -# Loading the configuration settings and definitions that are used to -# initialize the Survey class object -config = yaml.safe_load(Path(initialization_config).read_text()) +# # Loading the configuration settings and definitions that are used to +# # initialize the Survey class object +# config = yaml.safe_load(Path(initialization_config).read_text()) -nasc_frequency = config["acoustics"]["nasc_frequency"] \ No newline at end of file +# nasc_frequency = config["acoustics"]["nasc_frequency"] From 218df8aac98409fd90e6e89816bbe86b58c1e8eb Mon Sep 17 00:00:00 2001 From: Brandyn Lucca Date: Wed, 28 Aug 2024 09:32:16 -0700 Subject: [PATCH 77/81] Pruned `test_workflow.py` --- echopop/test_workflow.py | 556 ++++++++++++--------------------------- 1 file changed, 163 insertions(+), 393 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index 7c462db8..f85abd4b 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,393 +1,163 @@ -# from echopop.live.live_survey import LiveSurvey -# from echopop.live.sql_methods import SQL -# import echopop.live.live_visualizer as elv -# from pathlib import Path -# from echopop.live import live_data_processing as eldp -# from echopop.live import live_data_loading as eldl -# from echopop.live.live_core import( -# LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -# ) -# import boto3 -# from botocore.exceptions import NoCredentialsError, ClientError -# import pandas as pd -# import numpy as np -# from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, -# sql_group_update, query_processed_files, sql_update_strata_summary -# from echopop.live.live_spatial_methods import apply_spatial_definitions -# from echopop.live.live_acoustics import average_sigma_bs, compute_nasc -# from echopop.live.live_biology import compute_sigma_bs -# from echopop.acoustics import ts_length_regression, to_dB, to_linear -# from echopop.utils.operations import group_interpolator_creator -# from functools import reduce -# from echopop.live.live_data_loading import filter_filenames, read_biology_csv - -# ################################################################################################## -# # TEST: Set up `LiveSurvey` object -# # NOTE: General initialization parameter configuration -# live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initializat -# ion_config.yml" -# # NOTE: File configuration -# live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_yea -# r_2019_config.yml" -# # NOTE: Create object -# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# realtime_survey = LiveSurvey(live_file_config_path, live_init_config_path, verbose=True) - -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# self = realtime_survey -# file_configuration = self.config - -# input_filenames = ["202407_003_operation_info.csv", "202407_22500_003_lf.csv", -# "202407_22500_003_spec.csv", "202407_003_catch_perc.csv"] -# realtime_survey.config["input_directories"]["biology"]["directory"] = -# "s3://sh2407-upload/data/Echopop-biology" - -# survey_data = SQL("C:/Users/Brandyn/Downloads/acoustics.db", "select", -# table_name="survey_data_df") - - -# del realtime_survey.config["data_root_dir"] -# self = realtime_survey - -# # 
realtime_survey.config["storage_options"] = aws_credentials -# realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) -# realtime_survey.load_biology_data(input_filenames=input_filenames) -# realtime_survey.input["biology"] -# def is_s3_path(path): -# """Check if a path is an S3 path.""" -# return path.startswith("s3://") - -# dataset_directory = realtime_survey.config["input_directories"]["biology"]["directory"] -# s3_path = dataset_directory -# is_s3_path(dataset_directory) - -# cloud_credentials = aws_credentials -# cloud_credentials = {} -# def validate_s3_path(s3_path: str, cloud_credentials: dict): -# """Check if (parts of) S3 path exists.""" - -# # Redundant validation that S3 object validation is appropriate -# if not is_s3_path(s3_path): -# raise ValueError("The path is not an S3 path.") - -# # Validate credentials -# if not all([True if param in cloud_credentials.keys() else False -# for param in ["key", "secret"]]): -# # ---- Find missing credentials -# missing_creds = set(["key", "secret"]) - set(cloud_credentials) -# # ---- Format into string -# missing_creds_str = ", ".join(["'{}'".format(x.replace("'", "''")) for x in -# missing_creds]) -# # ---- Raise Error -# raise PermissionError( -# f"Required S3 credentials missing: {missing_creds_str}." -# ) - -# # Remove the s3:// prefix -# s3_path_reduced = s3_path[len("s3://"):] - -# # Split into bucket and key -# parts = s3_path_reduced.split("/", 1) -# if len(parts) < 2: -# raise ValueError(f"Invalid S3 path format for '{s3_path}'.") - -# # Get bucket name and directory keys -# bucket_name, directory = parts - -# # Initialize the S3 client -# s3_client = boto3.client("s3", -# aws_access_key_id=cloud_credentials["key"], -# aws_secret_access_key=cloud_credentials["secret"]) - -# # Check if the bucket exists -# try: -# s3_client.head_bucket(Bucket=bucket_name) -# except ClientError as e: -# raise FileNotFoundError( -# f"S3 bucket '{bucket_name}' does not exist or you do not have access." 
-# ) - -# # Check if the S3 directory exists -# try: -# # ---- Ping a response from the bucket -# response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=directory, MaxKeys=1) -# # ---- Check for `Contents` -# if "Contents" not in response: -# raise FileNotFoundError(f"S3 path '{s3_path}' does not exist.") -# except ClientError as e: -# # --- Raise Error and propagate it upwards -# raise e - -# validate_s3_path(s3_path, cloud_credentials) - -# import pandas as pd - -# self = realtime_survey -# biology_files = self.meta["provenance"]["biology_files_read"] -# file_configuration = self.config -# dataset = "biology" - -# # Get the dataset file settings -# file_settings = file_configuration["input_directories"][dataset] - -# def construct_directorypath(file_configuration: dict, file_settings: dict): -# """Construct the root directory path.""" - -# # Get the general root_directory, if present -# if "data_root_dir" in file_configuration: -# root_directory = file_configuration["data_root_dir"] -# else: -# root_directory = "" - -# # Get the local directory (or this may be the root directory depending on the config) -# data_directory = file_settings["directory"] - -# # Return the directory path -# if root_directory != "": -# return "/".join([root_directory, data_directory]) -# else: -# return data_directory - -# directory_path = construct_directorypath(file_configuration, file_settings) - -# def validate_local_path(directory_path: str): - -# # Validate filepath -# # ---- Error evaluation (if applicable) -# if not Path(directory_path).exists(): -# raise FileNotFoundError( -# f"The acoustic data directory [{directory_path}] does not exist." -# ) - -# # Validate that files even exist -# # ---- List available files of target extension -# data_files = list(directory_path.glob(f"*{'.'+file_settings['extension']}")) -# # ---- Error evaluation (if applicable) -# if not data_files: -# raise FileNotFoundError( -# f"No `*.{file_settings['extension']}` files found in [{directory_path}]!" -# ) - - -# # Get the biology data file settings -# file_settings = file_configuration["input_directories"]["biology"] - -# # Get the file-specific settings, datatypes, columns, etc. 
-# # ---- Get defined columns and datatypes from `LIVE_INPUT_FILE_CONFIG_MAP` -# biology_config_map = LIVE_INPUT_FILE_CONFIG_MAP["biology"] -# # ---- Extract the expected file name ID's -# biology_file_ids = file_settings["file_name_formats"] -# # ---- Extract all of the file ids -# biology_config_ids = list(biology_file_ids.keys()) -# # ---- Initialize the dictionary that will define this key in the `input` attribute -# biology_output = {f"{key}_df": pd.DataFrame() for key in biology_config_ids} - - -# # Initialize a session with AWS credentials -# s3_client = boto3.client( -# 's3', -# aws_access_key_id=aws_credentials["key"], -# aws_secret_access_key=aws_credentials["secret"] -# ) -# response = s3_client.list_buckets() -# buckets = response.get('Buckets', []) -# for bucket in buckets: -# print(f"Bucket Name: {bucket['Name']}") -# s3_client.head_bucket(Bucket="sh2407-upload") -# realtime_survey.load_biology_data(pandas_kwargs=aws_credentials, input_filenames=input_filenames) -# realtime_survey.config["ship_id"] -# grid_data = SQL(realtime_survey.config["database"]["grid"], "select", table_name="grid_df") -# grid_data[grid_data.abundance > 0] -# bucket = boto3.client("s3", region_name=None) -# bucket.head_bucket(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"] -# +"/") -# bucket.list_objects_v2(Bucket=realtime_survey.config["input_directories"]["biology"]["directory"], -# Prefix=path, MaxKeys=1) -# ################################################################################################# -# # TEST: TRIGGER --> NEW ACOUSTIC DATA -# # NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) -# realtime_survey.load_acoustic_data() -# # NOTE: Process new acoustic data -# # NOTE: This will update linked database tables -# realtime_survey.process_acoustic_data() -# # NOTE: Generate population estimates (or pass if there are no biological data) -# # NOTE: `working_dataset = Literal["acoustic", "biology"]` -# realtime_survey.estimate_population(working_dataset="acoustic") -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# realtime_survey.input["acoustics"] -# ################################################################################################## -# # TEST: TRIGGER --> NEW BIOLOGY DATA -# # NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` -# realtime_survey.load_biology_data() -# len(realtime_survey.meta["provenance"]["biology_files_checkpoint1"]) -# realtime_survey.meta["provenance"]["biology_files_checkpoint3"] -# # NOTE: Process new biological data -# # NOTE: This will update linked database tables -# realtime_survey.process_biology_data() -# # NOTE: Generate population estimates (or pass if there are no acoustic data) -# # NOTE: `working_dataset = Literal["acoustic", "biology"]` -# realtime_survey.estimate_population(working_dataset="biology") -# # NOTE: String-representation via `LiveSurvey.__repr__`: -# # NOTE: Lists current files being processed and linked databases (WIP) -# realtime_survey -# ################################################################################################## -# # TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` -# # NOTE: `LiveSurvey.meta` attribute -# # ---- ACOUSTIC -# realtime_survey.meta["provenance"]["acoustic_files"] -# # ---- BIOLOGICAL -# realtime_survey.meta["provenance"]["biology_files"] -# # NOTE: SQL function query from 
database file [cumulative list] -# # ---- ACOUSTIC -# SQL(db_file=realtime_survey.config["database"]["acoustics"], -# command="select", table_name="files_processed") -# dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", -# table_name="files_processed") -# # ---- BIOLOGICAL -# SQL(db_file=realtime_survey.config["database"]["biology"],command="select", -# table_name="files_processed") -# dat.loc[0:, "filepath"][105] -# ################################################################################################## -# # TEST: `LiveSurvey` --[(key) SQL tables]--> Users -# # !!! The SQL functions will fail if the tables have not yet been created/initialized -# # ---- ACOUSTICS -# # NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") -# .latitude.max() -# realtime_survey.input["spatial"]["strata"] -# # NOTE: Along-track acoustically-derived number/biomass densities and NASC -# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# # ---- BIOLOGICAL -# # NOTE: Fitted (discretized) length-weight relationship -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") -# # NOTE: Quantized length-binned weights (summed) -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") -# # NOTE: Average weights per stratum -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") -# # NOTE: Stratum summary tables -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") -# ################################################################################################## -# # FROM THE `LiveSurvey` object ! 
-# # ---- Convert to a Panel -# import panel as pn -# # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table -# survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) -# # grid_db = Path(realtime_survey.config["database"]["grid"]) -# grid_db = Path("C:/Users/Brandyn/Downloads/grid.db") -# dat = SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") -# dat -# dat1 = SQL(grid_db, "select", table_name="grid_df") -# SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") - -# sql_cmd = "SELECT * FROM sigma_bs_mean_df ORDER BY stratum, haul_num, species_id" -# # Create the engine -# engine = create_engine(f"sqlite:///{"C:/Users/Brandyn/Downloads/biology.db"}") -# # Create the SQL database connection and send the script -# with engine.connect() as connection: -# table = connection.execute(text(sql_cmd)) - -# data = table.fetchall() -# dd = pd.DataFrame(data, columns=table.keys()).loc[0:1, :] -# dd = dd[["stratum", "haul_num", "species_id", "sigma_bs", "sigma_bs_count", "sigma_bs_sum", "id"]] -# dd.loc[:, "id"] = pd.Series([f"{(4,4,4)}", f"{(5,5,5)}"]) -# SQL("C:/Users/Brandyn/Downloads/biology.db", "insert", table_name="sigma_bs_mean_df", -# dataframe=dd) -# SQL("C:/Users/Brandyn/Downloads/biology.db", "map") -# SQL(biology_db, "drop", table_name="sigma_bs_mean_df") -# SQL(biology_db, "select", table_name="sigma_bs_mean_df") -# dd.loc[:, "haul_num"] = pd.Series([101, 103]) -# dd = dd[["species_id", "haul_num", "id", "stratum", "sigma_bs", "sigma_bs_count", "sigma_bs_sum"]] -# SQL(biology_db, "insert", table_name="sigma_bs_mean_df", dataframe=dd, id_columns=key_list+["id"]) -# SQL(biology_db, "select", table_name="sigma_bs_mean_df") -# import numpy as np; import pandas as pd -# SQL("C:/Users/Brandyn/Downloads/biology.db", "select", table_name="length_weight_df") -# sigma_bs_df = SQL("C:/Users/Brandyn/Downloads/biology.db", "select", -# table_name="sigma_bs_mean_df") -# table_df = SQL(realtime_survey.config["database"]["biology"], "select", -# table_name="sigma_bs_mean_df") -# sigma_bs_df = table_df -# # ---- Check the table keys -# table_keys = np.unique(table_df["id"]).tolist() -# # ---- Get unique values -# current_keys = np.unique(sigma_bs_df["id"]).tolist() -# # ---- Get INSERTION keys -# insertion_keys = list(set(current_keys).difference(set(table_keys))) -# # ---- Get UPDATE keys -# update_keys = list(set(current_keys).intersection(set(table_keys))) -# insertion_df = sigma_bs_df[sigma_bs_df["id"].isin(insertion_keys)] -# insertion_df.loc[0, "species_id"] = 22500 -# insertion_df.loc[0, "stratum"] = 5 -# insertion_df.loc[0, "haul_num"] = 100 -# insertion_df.loc[0, "sigma_bs"] = 1e-10 -# insertion_df.loc[0, "sigma_bs_count"] = 100 -# insertion_df.loc[0, "sigma_bs_sum"] = 1e10 * 100 -# insertion_df.loc[0, "id"] = f"{(1,1,1)}" -# SQL(realtime_survey.config["database"]["biology"], "insert", table_name="sigma_bs_mean_df", -# dataframe=insertion_df) -# SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="sigma_bs_mean_df") -# survey_data = SQL(realtime_survey.config["database"]["acoustics"], "select", -# table_name="survey_data_df") -# dat1[dat1.abundance > 0] -# dat[dat.number_density > 0] -# coast_db = grid_db -# biology_db = Path(realtime_survey.config["database"]["biology"]) -# projection = realtime_survey.config["geospatial"]["projection"] -# # NOTE: PLOTS -# # Ensure Panel is initialized -# pn.extension() -# # ---- Helper function -# def 
plt_to_pn(fig): -# # Convert to a panel object -# panel = pn.panel(fig) -# # Display -# panel.show() # OR panel.servable() if you want to serve it in a Panel server -# # ---- PLOT GRID -# fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) -# fig.show() -# plt_to_pn(fig) -# # ---- PLOT TRACK -# from echopop.live.live_visualizer import plot_livesurvey_track -# fig1 = plot_livesurvey_track(survey_data, projection, coast_db) -# fig1.show() -# plt_to_pn(fig1) -# # ---- PLOT DISTRIBUTIONS -# weight_table = SQL(biology_db, "select", -# table_name="length_weight_df") -# stratum_table = SQL(biology_db, "select", -# table_name="strata_summary_df") -# specimen_table = SQL(biology_db, "select", -# table_name="specimen_data_df") -# length_table = SQL(biology_db, "select", -# table_name="length_df") -# fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, -# length_table) -# plt_to_pn(fig2) -# ### MULTIPANEL -# panel0 = pn.panel(fig, name='Gridded population estimates') -# panel1 = pn.panel(fig1, name='Alongtrack population estimates') -# panel2 = pn.panel(fig2, name='Length and weight distributions') - -# def serve_panels(): -# # Create links to each panel -# home = pn.Column( -# pn.pane.Markdown("# Main Page"), -# pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", -# sizing_mode="stretch_width"), -# pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", -# sizing_mode="stretch_width"), -# pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", -# sizing_mode="stretch_width") -# ) - -# # Serve the home page and individual panels -# pn.serve({ -# 'Main Page': home, -# 'gridded_population_estimates': panel0, -# 'alongtrack_population_estimates': panel1, -# 'length_weight_distributions': panel2 -# }, show=True) -# # Run the function to serve panels -# serve_panels() +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import SQL +import echopop.live.live_visualizer as elv +from pathlib import Path +from echopop.live import live_data_processing as eldp +from echopop.live import live_data_loading as eldl +from echopop.live.live_core import( + LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +) +import boto3 +from botocore.exceptions import NoCredentialsError, ClientError +import pandas as pd +import numpy as np +from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +sql_group_update, query_processed_files, sql_update_strata_summary +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.live.live_acoustics import average_sigma_bs, compute_nasc +from echopop.live.live_biology import compute_sigma_bs +from echopop.acoustics import ts_length_regression, to_dB, to_linear +from echopop.utils.operations import group_interpolator_creator +from functools import reduce +from echopop.live.live_data_loading import filter_filenames, read_biology_csv + +################################################################################################## +# TEST: Set up `LiveSurvey` object +# NOTE: General initialization parameter configuration +live_init_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_initialization_config.yml" +# NOTE: File configuration +live_file_config_path = "C:/Users/Brandyn/Documents/GitHub/echopop/config_files/live_survey_year_2019_config.yml" +# NOTE: Create object +realtime_survey = LiveSurvey(live_init_config_path, live_file_config_path, verbose=True) + +# 
NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey +################################################################################################# +# TEST: TRIGGER --> NEW ACOUSTIC DATA +# NOTE: Load new acoustic data (Either glob file search or `input_filenames Optional[List[str]]`) +realtime_survey.load_acoustic_data() +# NOTE: Process new acoustic data +# NOTE: This will update linked database tables +realtime_survey.process_acoustic_data() +# NOTE: Generate population estimates (or pass if there are no biological data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` +realtime_survey.estimate_population(working_dataset="acoustic") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey.input["acoustics"] +################################################################################################## +# TEST: TRIGGER --> NEW BIOLOGY DATA +# NOTE: Load new biological data (Either glob file search or `input_filenames Optional[List[str]]` +realtime_survey.load_biology_data() +# NOTE: Process new biological data +# NOTE: This will update linked database tables +realtime_survey.process_biology_data() +# NOTE: Generate population estimates (or pass if there are no acoustic data) +# NOTE: `working_dataset = Literal["acoustic", "biology"]` +realtime_survey.estimate_population(working_dataset="biology") +# NOTE: String-representation via `LiveSurvey.__repr__`: +# NOTE: Lists current files being processed and linked databases (WIP) +realtime_survey +################################################################################################## +# TEST: `LiveSurvey` --[`files_processed`]--> `Echodataflow` +# NOTE: `LiveSurvey.meta` attribute +# ---- ACOUSTIC +realtime_survey.meta["provenance"]["acoustic_files"] +# ---- BIOLOGICAL +realtime_survey.meta["provenance"]["biology_files"] +# NOTE: SQL function query from database file [cumulative list] +# ---- ACOUSTIC +SQL(db_file=realtime_survey.config["database"]["acoustics"], + command="select", table_name="files_processed") +dat = SQL(db_file=realtime_survey.config["database"]["acoustics"],command="select", +table_name="files_processed") +# ---- BIOLOGICAL +SQL(db_file=realtime_survey.config["database"]["biology"],command="select", +table_name="files_processed") +################################################################################################## +# TEST: `LiveSurvey` --[(key) SQL tables]--> Users +# !!! 
The SQL functions will fail if the tables have not yet been created/initialized +# ---- ACOUSTICS +# NOTE: Mean linear backscatter coefficient (`sigma_bs`) keyed for each haul and stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="sigma_bs_mean_df") +SQL(realtime_survey.config["database"]["biology"], "select", table_name="specimen_df") +.latitude.max() +realtime_survey.input["spatial"]["strata"] +# NOTE: Along-track acoustically-derived number/biomass densities and NASC +SQL(realtime_survey.config["database"]["acoustics"], "select", table_name="survey_data_df") +# ---- BIOLOGICAL +# NOTE: Fitted (discretized) length-weight relationship +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_fitted_df") +# NOTE: Quantized length-binned weights (summed) +SQL(realtime_survey.config["database"]["biology"], "select", table_name="length_weight_df") +# NOTE: Average weights per stratum +SQL(realtime_survey.config["database"]["biology"], "select", table_name="weight_stratum_df") +# NOTE: Stratum summary tables +SQL(realtime_survey.config["database"]["biology"], "select", table_name="strata_summary_df") +################################################################################################## +# FROM THE `LiveSurvey` object ! +# ---- Convert to a Panel +import panel as pn +# ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table +survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) +grid_db = Path(realtime_survey.config["database"]["grid"]) +coast_db = grid_db +biology_db = Path(realtime_survey.config["database"]["biology"]) +projection = realtime_survey.config["geospatial"]["projection"] +# NOTE: PLOTS +# Ensure Panel is initialized +pn.extension() +# ---- Helper function +def plt_to_pn(fig): + # Convert to a panel object + panel = pn.panel(fig) + # Display + panel.show() # OR panel.servable() if you want to serve it in a Panel server +# ---- PLOT GRID +fig = elv.plot_livesurvey_grid(grid_db, projection, coast_db) +fig.show() +plt_to_pn(fig) +# ---- PLOT TRACK +from echopop.live.live_visualizer import plot_livesurvey_track +fig1 = plot_livesurvey_track(survey_data, projection, coast_db) +fig1.show() +plt_to_pn(fig1) +# ---- PLOT DISTRIBUTIONS +weight_table = SQL(biology_db, "select", + table_name="length_weight_df") +stratum_table = SQL(biology_db, "select", + table_name="strata_summary_df") +specimen_table = SQL(biology_db, "select", + table_name="specimen_data_df") +length_table = SQL(biology_db, "select", + table_name="length_df") +fig2 = elv.plot_livesurvey_distributions(weight_table, stratum_table, specimen_table, +length_table) +plt_to_pn(fig2) +### MULTIPANEL +panel0 = pn.panel(fig, name='Gridded population estimates') +panel1 = pn.panel(fig1, name='Alongtrack population estimates') +panel2 = pn.panel(fig2, name='Length and weight distributions') + +def serve_panels(): + # Create links to each panel + home = pn.Column( + pn.pane.Markdown("# Main Page"), + pn.pane.Markdown("[Gridded population estimates](gridded_population_estimates)", +sizing_mode="stretch_width"), + pn.pane.Markdown("[Alongtrack population estimates](alongtrack_population_estimates)", +sizing_mode="stretch_width"), + pn.pane.Markdown("[Length and weight distributions](length_weight_distributions)", +sizing_mode="stretch_width") + ) + + # Serve the home page and individual panels + pn.serve({ + 'Main Page': home, + 'gridded_population_estimates': panel0, + 'alongtrack_population_estimates': panel1, + 
'length_weight_distributions': panel2 + }, show=True) +# Run the function to serve panels +serve_panels() From 5a0eac8634cec200f05642dc7f62ab6f16af2f22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:32:39 +0000 Subject: [PATCH 78/81] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/test_workflow.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/echopop/test_workflow.py b/echopop/test_workflow.py index f85abd4b..d274d021 100644 --- a/echopop/test_workflow.py +++ b/echopop/test_workflow.py @@ -1,25 +1,25 @@ -from echopop.live.live_survey import LiveSurvey -from echopop.live.sql_methods import SQL -import echopop.live.live_visualizer as elv from pathlib import Path -from echopop.live import live_data_processing as eldp -from echopop.live import live_data_loading as eldl -from echopop.live.live_core import( - LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP -) + import boto3 -from botocore.exceptions import NoCredentialsError, ClientError -import pandas as pd import numpy as np -from echopop.live.sql_methods import SQL, sql_data_exchange, get_table_key_names, +import pandas as pd +from botocore.exceptions import ClientError, NoCredentialsError + +import echopop.live.live_visualizer as elv +from echopop.live import live_data_loading as eldl, live_data_processing as eldp +from echopop.live.live_core import LIVE_DATA_STRUCTURE, LIVE_INPUT_FILE_CONFIG_MAP +from echopop.live.live_survey import LiveSurvey +from echopop.live.sql_methods import SQL, get_table_key_names, sql_data_exchange + sql_group_update, query_processed_files, sql_update_strata_summary -from echopop.live.live_spatial_methods import apply_spatial_definitions +from functools import reduce + +from echopop.acoustics import to_dB, to_linear, ts_length_regression from echopop.live.live_acoustics import average_sigma_bs, compute_nasc from echopop.live.live_biology import compute_sigma_bs -from echopop.acoustics import ts_length_regression, to_dB, to_linear -from echopop.utils.operations import group_interpolator_creator -from functools import reduce from echopop.live.live_data_loading import filter_filenames, read_biology_csv +from echopop.live.live_spatial_methods import apply_spatial_definitions +from echopop.utils.operations import group_interpolator_creator ################################################################################################## # TEST: Set up `LiveSurvey` object @@ -99,6 +99,7 @@ # FROM THE `LiveSurvey` object ! 
# ---- Convert to a Panel import panel as pn + # ---- Either have the db file already called in as a `pandas.DataFrame`, or query the table survey_data_db = Path(realtime_survey.config["database"]["acoustics"]) grid_db = Path(realtime_survey.config["database"]["grid"]) @@ -120,6 +121,7 @@ def plt_to_pn(fig): plt_to_pn(fig) # ---- PLOT TRACK from echopop.live.live_visualizer import plot_livesurvey_track + fig1 = plot_livesurvey_track(survey_data, projection, coast_db) fig1.show() plt_to_pn(fig1) From 2a277a63c7421e2506103ca2034cc9741ef4a40e Mon Sep 17 00:00:00 2001 From: Sohambutala Date: Sun, 27 Oct 2024 02:46:54 +0000 Subject: [PATCH 79/81] add echopop live viz cmap and fig seq tweaks --- echopop/live/live_visualizer.py | 84 ++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index a1d55a26..a23baa88 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -66,28 +66,28 @@ def plot_livesurvey_grid( "number_density_mean": { "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", - "colormap": "viridis", + "colormap": "cividis", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, "biomass_density_mean": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", - "colormap": "plasma", + "colormap": "magma", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, - "biomass": { - "name": "Biomass", - "units": "kg", - "colormap": "cividis", + "abundance": { + "name": "Abundance", + "units": "$\\it{N}$", + "colormap": "viridis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), "maximum": 1e6 * grid_gdf["area"].max(), }, }, - "abundance": { - "name": "Abundance", - "units": "$\\it{N}$", - "colormap": "inferno", + "biomass": { + "name": "Biomass", + "units": "kg", + "colormap": "plasma", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), "maximum": 1e6 * grid_gdf["area"].max(), @@ -106,7 +106,7 @@ def plot_livesurvey_grid( # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap(np.linspace(0, 1, 256))[::-1] + newcolors = colormap(np.linspace(0, 1, 256))#[::-1] # ---- Define `white` white = np.array([1, 1, 1, 1]) # ---- Replace "start" color @@ -244,12 +244,21 @@ def plot_livesurvey_track( # Variable label dictionary map VARIABLE_MAP = { + "nasc": { + "name": "Nautical area scattering coefficient", + "units": "$\\mathregular{m^{2}~nmi^{-2}}$", + "colormap": "YlOrRd", + "minimum": 0.0, + "cbar_reverse": False, + "color_threshold": {"minimum": 1e2, "maximum": 1e4}, + "size": [25, 150], + }, "number_density": { "name": "Mean number density", "units": "fish $\\mathregular{nmi^{-2}}$", - "colormap": "inferno", + "colormap": "Purples", "minimum": 0.0, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": { "minimum": 1e1, "maximum": 1e6, @@ -259,30 +268,21 @@ def plot_livesurvey_track( "biomass_density": { "name": "Mean biomass density", "units": "kg $\\mathregular{nmi^{-2}}$", - "colormap": "plasma", + "colormap": "Greens", "minimum": 0.0, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": { "minimum": 1e1, "maximum": 1e6, }, "size": [25, 150], }, - "nasc": { - "name": "Nautical area scattering coefficient", - "units": "$\\mathregular{m^{2}~nmi^{-2}}$", - "colormap": "viridis", - "minimum": 0.0, - "cbar_reverse": False, - "color_threshold": {"minimum": 1e2, "maximum": 1e4}, - "size": [25, 
150], - }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", "units": "dB re. 1 $\\mathregular{m^-1}$", - "colormap": "viridis", + "colormap": "Blues", "minimum": -999, - "cbar_reverse": True, + "cbar_reverse": False, "color_threshold": {"minimum": -80.0, "maximum": -36.0}, "size": [5, 100], }, @@ -331,13 +331,16 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): # Iterate through and plot all subplots for ax, var in zip(axes.flat, intact_variables): - # ---- Get the colormap - colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) - # ---- Invert - if VARIABLE_MAP[var]["cbar_reverse"]: - newcolors = colormap(np.linspace(0, 1, 256))[::-1] - # ---- Create the new custom colormap - custom_cmap = ListedColormap(newcolors) + # # ---- Get the colormap + # colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) + # # ---- Invert + # if VARIABLE_MAP[var]["cbar_reverse"]: + # newcolors = colormap(np.linspace(0, 1, 256))[::-1] + # else: + # newcolors = colormap + # # ---- Create the new custom colormap + # custom_cmap = ListedColormap(newcolors) + custom_cmap = VARIABLE_MAP[var]["colormap"] # ---- Plot cruisetrack # survey_gdf.plot(ax=ax, color="dimgray", linewidth=0.25, linestyle="-") # ax.plot(survey_gdf.geometry.x, survey_gdf.geometry.y, color="dimgray", @@ -346,11 +349,11 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): for ship_id, group in survey_gdf.groupby("ship_id"): # Sort the group by latitude or longitude # group = group.sort_values(by=["latitude", "longitude"]) - color = ship_id_colors.get(ship_id, "gray") + # color = ship_id_colors.get(ship_id, "gray") (line_handle,) = ax.plot( group.geometry.x, group.geometry.y, - color=color, + color="gray", linewidth=0.25, linestyle="-", label=ship_id, @@ -375,6 +378,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): [geom.x for geom in sub_gdf.geometry], [geom.y for geom in sub_gdf.geometry], c=sub_gdf[var], + # s=20, s=scale_sizes( values=sub_gdf[var], min_value=min_value, @@ -548,9 +552,10 @@ def plot_livesurvey_distributions( ax_weight.plot( group["length_bin"], group["proportions"], - marker="o", + marker=".", label=f"Stratum {stratum}", color=color, + lw=1, ms=ms, ) if i == 0: @@ -561,7 +566,7 @@ def plot_livesurvey_distributions( ax_weight.set_ylabel("Within-stratum proportion [0, 1]") if i == num_sexes - 1: # Bottom plot ax_weight.set_xlabel("Length bin (cm)") - ax_weight.set_ylim(0.0, 1.0) + ax_weight.set_ylim(0.0, 0.8) # Add label in the top-left corner ax_weight.text( 0.05, @@ -582,9 +587,10 @@ def plot_livesurvey_distributions( ax_count.plot( group["length_bin"], group["number_proportion"], - marker="o", + marker=".", label=f"Stratum {stratum}", color=color, + lw=1, ms=ms, ) if i == 0: @@ -593,7 +599,7 @@ def plot_livesurvey_distributions( ax_count.set_xlabel("") if i == num_sexes - 1: # Bottom plot ax_count.set_xlabel("Length bin (cm)") - ax_count.set_ylim(0.0, 1.0) + ax_count.set_ylim(0.0, 0.8) # Add label in the top-left corner ax_count.text( 0.05, From 30c05ffff87061f76ce358b643c794ab508962dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 27 Oct 2024 02:47:28 +0000 Subject: [PATCH 80/81] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- echopop/live/live_visualizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/echopop/live/live_visualizer.py 
b/echopop/live/live_visualizer.py index a23baa88..1bba7fff 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -106,7 +106,7 @@ def plot_livesurvey_grid( # ---- Get the colormap colormap = plt.colormaps.get_cmap(VARIABLE_MAP[var]["colormap"]).resampled(256) # ---- Invert - newcolors = colormap(np.linspace(0, 1, 256))#[::-1] + newcolors = colormap(np.linspace(0, 1, 256)) # [::-1] # ---- Define `white` white = np.array([1, 1, 1, 1]) # ---- Replace "start" color From 99a91a7d91a833b05004dd6c0d4cc8d9b9d398eb Mon Sep 17 00:00:00 2001 From: Sohambutala Date: Sun, 27 Oct 2024 03:02:36 +0000 Subject: [PATCH 81/81] tweak zorder of scatter and line in track plot, tweak units --- echopop/live/live_visualizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/echopop/live/live_visualizer.py b/echopop/live/live_visualizer.py index a23baa88..35fa8bbc 100644 --- a/echopop/live/live_visualizer.py +++ b/echopop/live/live_visualizer.py @@ -65,7 +65,7 @@ def plot_livesurvey_grid( VARIABLE_MAP = { "number_density_mean": { "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$", + "units": "Number of fish per $\\mathregular{nmi^2}$", "colormap": "cividis", "color_threshold": {"minimum": 1e1, "maximum": 1e6}, }, @@ -77,7 +77,7 @@ def plot_livesurvey_grid( }, "abundance": { "name": "Abundance", - "units": "$\\it{N}$", + "units": "Number of fish", "colormap": "viridis", "color_threshold": { "minimum": 1e1 * grid_gdf["area"].max(), @@ -255,7 +255,7 @@ def plot_livesurvey_track( }, "number_density": { "name": "Mean number density", - "units": "fish $\\mathregular{nmi^{-2}}$", + "units": "Number of fish per $\\mathregular{nmi^2}$", "colormap": "Purples", "minimum": 0.0, "cbar_reverse": False, @@ -279,7 +279,7 @@ def plot_livesurvey_track( }, "max_Sv": { "name": "Max $\\mathregular{S_V}$", - "units": "dB re. 1 $\\mathregular{m^-1}$", + "units": "dB re 1 $\\mathregular{m^-1}$", "colormap": "Blues", "minimum": -999, "cbar_reverse": False, @@ -357,7 +357,7 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): linewidth=0.25, linestyle="-", label=ship_id, - zorder=1, + zorder=2, ) handles.append(line_handle) # Add handle to legend # ax.plot(group.geometry.x, group.geometry.y, label=ship_id, linewidth=0.25, @@ -388,7 +388,9 @@ def scale_sizes(values, min_value, max_value, min_size=25, max_size=250): ), cmap=custom_cmap, norm=norm, - zorder=2, + zorder=1, + alpha=0.6, + lw=0, # edgecolor="black", # linewidths=0.1 )
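
Note on the visualizer patches above (79 through 81): taken together, the track plot now passes named matplotlib colormap strings (for example "YlOrRd" for NASC) straight to `scatter` instead of building a reversed `ListedColormap`, draws every ship's cruise track in plain gray, and, after patch 81, layers that thin gray line at `zorder=2` above semi-transparent, size-scaled points at `zorder=1` with `alpha=0.6` and `lw=0`. The surrounding context also suggests the grid plot keeps its hand-built colormap but no longer reverses it, only forcing the first entry to white. The sketch below is a minimal, self-contained illustration of both ideas under those assumptions; it does not call `plot_livesurvey_track` or `plot_livesurvey_grid`, the coordinates and NASC values are synthetic, and the inline size scaling only roughly mirrors the module's `scale_sizes` helper.

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Synthetic stand-ins for a single cruise track and its NASC values
rng = np.random.default_rng(1)
lon = np.linspace(-126.0, -124.0, 200)
lat = 45.0 + 0.5 * np.sin(np.linspace(0.0, 6.0, 200))
nasc = rng.lognormal(mean=5.0, sigma=1.5, size=lon.size)

fig, ax = plt.subplots(figsize=(6, 4))

# Thin gray track drawn above the points (the zorder swap from patch 81)
ax.plot(lon, lat, color="gray", linewidth=0.25, linestyle="-", zorder=2)

# Colored, size-scaled points underneath, using a named colormap string
# (patch 79) with soft blending and no marker edges (patch 81); the size
# mapping approximates the [25, 150] range used for NASC in VARIABLE_MAP
sizes = 25 + (nasc - nasc.min()) / (nasc.max() - nasc.min()) * (150 - 25)
points = ax.scatter(
    lon, lat,
    c=nasc,
    s=sizes,
    cmap="YlOrRd",
    alpha=0.6,
    lw=0,
    zorder=1,
)
fig.colorbar(points, ax=ax, label="NASC ($\\mathregular{m^{2}~nmi^{-2}}$)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# Condensed version of the grid plot's white-start colormap (no reversal),
# using the colormap registry available in recent matplotlib releases;
# this object would be passed as `cmap=` when plotting the gridded fields
base = mpl.colormaps["cividis"]
colors = base(np.linspace(0.0, 1.0, 256))
colors[0] = (1.0, 1.0, 1.0, 1.0)  # empty grid cells render as white
white_start_cmap = ListedColormap(colors)

plt.show()

Keeping the line at the higher zorder keeps the cruise track legible where points pile up, while the transparency and zero edge width let dense clusters blend rather than stack hard outlines.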