
Commit

Merge branch 'release/2.6.0' into feature/issue-189
sliu008 authored Sep 15, 2023
2 parents cf35bbe + 5a336d6 commit ce49371
Showing 9 changed files with 117 additions and 31 deletions.
51 changes: 49 additions & 2 deletions .github/workflows/build-pipeline.yml
@@ -86,6 +86,29 @@ jobs:
run: |
poetry run pytest --junitxml=build/reports/pytest.xml --cov=podaac/ --cov-report=xml:build/reports/coverage.xml -m "not aws and not integration" tests/
- name: SonarCloud Scan
id: sonarcloud
uses: sonarsource/sonarcloud-github-action@master
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
with:
args: >
-Dsonar.organization=${{ github.repository_owner }}
-Dsonar.projectKey=${{ github.repository_owner }}_l2ss-py
-Dsonar.python.coverage.reportPaths=build/reports/coverage.xml
-Dsonar.sources=podaac/
-Dsonar.tests=tests/
-Dsonar.projectName=l2ss-py
-Dsonar.projectVersion=${{ env.software_version }}
-Dsonar.python.version=3.8,3.9,3.10
continue-on-error: true
- name: Wait to retry sonarcloud scan
if: steps.sonarcloud.outcome == 'failure'
run: |
sleep 40
- name: SonarCloud Scan Retry
id: sonarcloud-retry
if: steps.sonarcloud.outcome == 'failure'
uses: sonarsource/sonarcloud-github-action@master
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
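The two SonarCloud steps above form a manual retry: the first attempt is marked `continue-on-error: true`, and the retry step is gated on `steps.sonarcloud.outcome == 'failure'` after a fixed 40-second wait. A minimal Python sketch of the same retry shape (the `step` callable and the default back-off are illustrative, not part of the workflow):

```python
import time

def run_with_single_retry(step, backoff_seconds=40):
    """Run `step` once; on failure, wait a fixed back-off, then retry once."""
    try:
        return step()
    except Exception:
        time.sleep(backoff_seconds)
        return step()
```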
@@ -142,19 +165,43 @@ jobs:
git tag -a "${{ env.software_version }}" -m "Version ${{ env.software_version }}"
git push origin "${{ env.software_version }}"
- name: Publish UMM-S with new version
id: publish-umm-s
uses: podaac/cmr-umm-updater@0.5.0
if: |
github.ref == 'refs/heads/main' ||
startsWith(github.ref, 'refs/heads/release')
with:
umm-s-json: 'cmr/l2ss_cmr_umm_s.json'
umm-json: 'cmr/l2ss_cmr_umm_s.json'
provider: 'POCLOUD'
env: ${{ env.venue }}
version: ${{ env.software_version }}
timeout: 60
disable_removal: 'true'
umm_type: 'umm-s'
use_associations: 'false'
env:
LAUNCHPAD_TOKEN_SIT: ${{secrets.LAUNCHPAD_TOKEN_SIT}}
LAUNCHPAD_TOKEN_UAT: ${{secrets.LAUNCHPAD_TOKEN_UAT}}
LAUNCHPAD_TOKEN_OPS: ${{secrets.LAUNCHPAD_TOKEN_OPS}}
continue-on-error: true
- name: Wait to retry publishing umm-s
if: steps.publish-umm-s.outcome == 'failure'
run: |
sleep 120
- name: Publish UMM-S with new version retry
id: publish-umm-s-retry
uses: podaac/cmr-umm-updater@0.5.0
if: |
steps.publish-umm-s.outcome == 'failure'
with:
umm-json: 'cmr/l2ss_cmr_umm_s.json'
provider: 'POCLOUD'
env: ${{ env.venue }}
version: ${{ env.software_version }}
timeout: 60
disable_removal: 'true'
umm_type: 'umm-s'
use_associations: 'false'
env:
LAUNCHPAD_TOKEN_SIT: ${{secrets.LAUNCHPAD_TOKEN_SIT}}
LAUNCHPAD_TOKEN_UAT: ${{secrets.LAUNCHPAD_TOKEN_UAT}}
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -19,12 +19,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [issue/182](https://github.com/podaac/l2ss-py/issues/182): Update code so it doesn't remove '/' from attribute values.
- [issue/178](https://github.com/podaac/l2ss-py/issues/178): Add a function to make sure dimensions in the subset are the same as in the original file
- Update github action umm updater to 0.5.0
- [issue/172](https://github.com/podaac/l2ss-py/issues/172): Fix shapefile subsetting by passing the correct variable to the subset function.
### Deprecated
### Removed
### Fixed
- [issue/119](https://github.com/podaac/l2ss-py/issues/119): GPM variable dimensions are renamed from "phony_dim" to the dimension names in the variable attribute "DimensionNames" (a minimal sketch follows this list)
- [issue/184](https://github.com/podaac/l2ss-py/issues/184): bounding box argument at the command line is changed to allow decimal numbers (i.e., floats) for coordinates
- [issue/189](https://github.com/podaac/l2ss-py/issues/189): Fix temporal subsetting for SWOT collections: use the mask_and_scale argument when opening a granule file whose time fill value overflows, and use the original dataset encoding when writing the output file.
- [issue/194](https://github.com/podaac/l2ss-py/issues/194): Return coordinate variables if requested in a variable subset
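A minimal sketch of the issue/119 rename described above, assuming GPM granules carry a comma-separated `DimensionNames` attribute on each variable (illustrative helper, not the project's code):

```python
import xarray as xr

def rename_phony_dims(ds: xr.Dataset) -> xr.Dataset:
    """Rename 'phony_dim_N' dimensions using each variable's
    DimensionNames attribute, e.g. 'nscan,npixel' -> real names."""
    mapping = {}
    for var in ds.variables.values():
        dim_names = var.attrs.get("DimensionNames")
        if not dim_names:
            continue
        for dim, real_name in zip(var.dims, dim_names.split(",")):
            if str(dim).startswith("phony_dim"):
                mapping[dim] = real_name
    return ds.rename(mapping)
```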
### Security


9 changes: 8 additions & 1 deletion podaac/subsetter/group_handling.py
@@ -242,11 +242,18 @@ def walk_h5py(data_new, group):
walk_h5py(data_new, data_new.name)

# Get the instrument name from the file attributes
instrument = data_new['__HDFEOS__ADDITIONAL__FILE_ATTRIBUTES'].attrs['InstrumentName'].decode("utf-8")

additional_file_attributes = data_new.get('__HDFEOS__ADDITIONAL__FILE_ATTRIBUTES')
instrument = ""

if additional_file_attributes:
instrument = additional_file_attributes.attrs['InstrumentName'].decode("utf-8")
if 'OMI' in instrument:
hdf_type = 'OMI'
elif 'MLS' in instrument:
hdf_type = 'MLS'
else:
hdf_type = None

for del_group in del_group_list:
del data_new[del_group]
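The guard above replaces a direct attribute access that raised a KeyError for files lacking the HDF-EOS additional-attributes group. A standalone sketch of the same pattern, using the standard HDF-EOS path rather than the flattened '__HDFEOS__ADDITIONAL__FILE_ATTRIBUTES' name the real code sees:

```python
import h5py

def detect_hdf_type(path: str):
    """Return 'OMI', 'MLS', or None from the InstrumentName file
    attribute, tolerating files without the attributes group."""
    with h5py.File(path, "r") as data:
        attrs_group = data.get("HDFEOS/ADDITIONAL/FILE_ATTRIBUTES")
        instrument = ""
        if attrs_group is not None:
            raw = attrs_group.attrs.get("InstrumentName", b"")
            instrument = raw.decode("utf-8") if isinstance(raw, bytes) else str(raw)
    if "OMI" in instrument:
        return "OMI"
    if "MLS" in instrument:
        return "MLS"
    return None
```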
2 changes: 1 addition & 1 deletion podaac/subsetter/run_subsetter.py
@@ -35,7 +35,7 @@ def parse_args(args: list) -> tuple:
)
parser.add_argument(
'--bbox',
type=int,
type=float,
default=[-180, -90, 180, 90],
nargs=4,
action='store',
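That one-word type change is what allows fractional coordinates on the command line. A self-contained sketch of the fixed option (help text is illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--bbox',
    type=float,  # was type=int, which rejected decimal coordinates
    default=[-180, -90, 180, 90],
    nargs=4,
    action='store',
    help='west south east north',  # illustrative
)

args = parser.parse_args(['--bbox', '-170.5', '-10.25', '170.5', '10.25'])
print(args.bbox)  # [-170.5, -10.25, 170.5, 10.25]
```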
72 changes: 50 additions & 22 deletions podaac/subsetter/subset.py
@@ -24,7 +24,8 @@
import operator
import os
from itertools import zip_longest
from typing import List, Tuple, Union
from typing import List, Optional, Tuple, Union
import traceback
import dateutil
from dateutil import parser

@@ -744,7 +745,7 @@ def build_cond(str_timestamp, compare):
return temporal_cond


def get_base_group_names(lats): # pylint: disable=too-many-branches
def get_base_group_names(lats: List[str]) -> Tuple[List[str], List[Union[int, str]]]: # pylint: disable=too-many-branches
"""Latitude groups may be at different depths. This function gets the level
number that makes each latitude group unique from the other latitude names"""
unique_groups = []
@@ -756,7 +757,7 @@ def get_base_group_names(lats): # pylint: disable=too-many-branches
# put the groups in the same levels in the same list
group_list_transpose = np.array(group_list).T.tolist()

diff_count = ['' for i in range(len(group_list))]
diff_count = ['' for _ in range(len(group_list))]
group_count = 0
# loop through each group level
for my_list in group_list_transpose:
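A toy illustration of what the docstring describes, assuming the '__' group delimiter used elsewhere in subset.py (paths are hypothetical):

```python
# Two latitude variables living in different groups:
lats = ['__data_01__lat', '__data_02__lat']

# Split each path into its group levels, dropping the leading delimiter:
group_list = [lat.strip('_').split('__') for lat in lats]
print(group_list)  # [['data_01', 'lat'], ['data_02', 'lat']]

# Transposing lines up same-depth levels; the first level already differs
# ('data_01' vs 'data_02'), so that level number is what makes each
# latitude group name unique.
```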
@@ -788,7 +789,7 @@ def subset_with_bbox(dataset: xr.Dataset, # pylint: disable=too-many-branches
lat_var_names: list,
lon_var_names: list,
time_var_names: list,
variables=None,
variables: Optional[List[str]] = None,
bbox: np.ndarray = None,
cut: bool = True,
min_time: str = None,
@@ -806,6 +807,8 @@
Name of the longitude variables in the given dataset
time_var_names : list
Name of the time variables in the given dataset
variables : list[str]
List of variables to include in the result
bbox : np.array
Spatial bounding box to subset Dataset with.
cut : bool
@@ -836,13 +839,6 @@
else:
unique_groups = [f'{GROUP_DELIM}{GROUP_DELIM.join(x.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])}' for x in lat_var_names]

# get unique group names for latitude coordinates
diff_count = [-1]
if len(lat_var_names) > 1:
unique_groups, diff_count = get_base_group_names(lat_var_names)
else:
unique_groups = [f'{GROUP_DELIM}{GROUP_DELIM.join(x.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])}' for x in lat_var_names]

datasets = []
total_list = [] # don't include repeated variables
for lat_var_name, lon_var_name, time_var_name, diffs in zip( # pylint: disable=too-many-nested-blocks
@@ -853,6 +849,11 @@

if diffs == -1: # if the lat name is in the root group: take only the root group vars
group_vars = list(dataset.data_vars.keys())
# include the coordinate variables if the user asks for them
group_vars.extend([
var for var in list(dataset.coords.keys())
if var in variables and var not in group_vars
])
else:
group_vars = [
var for var in dataset.data_vars.keys()
@@ -861,13 +862,20 @@
# include variables that aren't in a latitude group
if variables:
group_vars.extend([
var for var in dataset.data_vars.keys()
if var in variables and var not in group_vars and var not in total_list and not var.startswith(tuple(unique_groups))
var for var in dataset.variables.keys()
if (var in variables and
var not in group_vars and
var not in total_list and
not var.startswith(tuple(unique_groups))
)
])
else:
group_vars.extend([
var for var in dataset.data_vars.keys()
if var not in group_vars and var not in total_list and not var.startswith(tuple(unique_groups))
if (var not in group_vars and
var not in total_list and
not var.startswith(tuple(unique_groups))
)
])

# group dimensions do not get carried over if unused by data variables (MLS nTotalTimes var)
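The `group_vars.extend` over the dataset coordinates above is the issue/194 change: coordinate variables the user explicitly requests are now kept rather than dropped because they are not data variables. A toy xarray illustration (hypothetical dataset):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"sst": ("time", np.array([280.1, 281.3]))},
    coords={"time": [0, 1], "lat": 10.0},
)
requested = ["sst", "lat"]

group_vars = list(ds.data_vars)  # ['sst']; coordinates are not data_vars
group_vars.extend(
    var for var in ds.coords
    if var in requested and var not in group_vars
)
print(group_vars)  # ['sst', 'lat']
```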
@@ -1065,6 +1073,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
xarray.coding.times.decode_cf_datetime = decode_cf_datetime


def open_dataset_test(file, args):
"""
Open a NetCDF dataset using xarray, handling specific exceptions.
This function attempts to open a NetCDF dataset using the provided arguments.
If an OverflowError with a specific message is encountered, it modifies the
'mask_and_scale' argument to True and retries opening the dataset.
Args:
file (str): Path to the NetCDF file.
args (dict): Dictionary of arguments to pass to xr.open_dataset.
Returns:
None: The function modifies the 'args' dictionary in place.
"""
try:
test_xr_open = xr.open_dataset(file, **args)
test_xr_open.close()
except Exception: # pylint: disable=broad-except
traceback_str = traceback.format_exc()

# Check for the specific OverflowError message
if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
args["mask_and_scale"] = True
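A short usage sketch of the probe-then-open flow this function supports (file name and argument values are illustrative):

```python
import xarray as xr

args = {"decode_times": True}
open_dataset_test("granule.nc", args)  # may set args["mask_and_scale"] = True
dataset = xr.open_dataset("granule.nc", **args)
```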


def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
variables: Union[List[str], str, None] = (),
# pylint: disable=too-many-branches, disable=too-many-statements
@@ -1162,14 +1197,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,

if min_time or max_time:
args['decode_times'] = True
# check fill value and dtype, we know that this will cause an integer Overflow with xarray
if 'time' in nc_dataset.variables.keys():
try:
if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and \
nc_dataset['time'].dtype == 'float64':
args['mask_and_scale'] = True
except AttributeError:
pass
open_dataset_test(file_to_subset, args)

with xr.open_dataset(
xr.backends.NetCDF4DataStore(nc_dataset),
2 changes: 1 addition & 1 deletion podaac/subsetter/subset_harmony.py
@@ -136,7 +136,7 @@ def process_item(self, item: pystac.Item, source: harmony.message.Source) -> pystac.Item:
harmony_bbox = message.subset.bbox

if message.subset and message.subset.shape:
subset_params['shapefile_path'] = download(
subset_params['shapefile'] = download(
message.subset.shape.href,
temp_dir,
logger=self.logger,
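The key rename matters because the params dict is ultimately splatted into the subsetter call, so the dict key has to match the keyword parameter name. A minimal sketch, assuming the subsetter exposes a `shapefile` keyword as the changelog's issue/172 entry implies (the function body is a stand-in):

```python
def subset(file_to_subset, bbox, output_file, shapefile=None, **kwargs):
    """Stand-in for the real subsetter entry point."""
    print("shapefile:", shapefile)

subset_params = {"shapefile": "/tmp/aoi.shp"}  # was 'shapefile_path'
subset("in.nc", None, "out.nc", **subset_params)
```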
2 changes: 1 addition & 1 deletion podaac/subsetter/xarray_enhancements.py
@@ -54,7 +54,7 @@ def get_indexers_from_1d(cond: xr.Dataset) -> dict:

def get_indexers_from_nd(cond: xr.Dataset, cut: bool) -> dict:
"""
Get indexers from a dataset with more than 1 dimensions.
Get indexers from a dataset with more than one dimension.
Parameters
----------
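For reference, a toy version of the N-D case the corrected docstring refers to: collapse a 2-D boolean condition into per-dimension index arrays (dimension names are hypothetical; the real helper's result also depends on its cut flag):

```python
import numpy as np

cond = np.array([[False, True, False],
                 [False, True, True]])

rows = np.any(cond, axis=1).nonzero()[0]  # row indices with any match
cols = np.any(cond, axis=0).nonzero()[0]  # column indices with any match
indexers = {"y": rows, "x": cols}
print(indexers)  # {'y': array([0, 1]), 'x': array([1, 2])}
```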
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@

[tool.poetry]
name = "l2ss-py"
version = "2.6.0rc2"
version = "2.6.0rc11"
description = "L2 Subsetter Service"
authors = ["podaac-tva <[email protected]>"]
license = "Apache-2.0"
4 changes: 3 additions & 1 deletion tests/test_subset.py
@@ -2141,7 +2141,7 @@ def test_tropomi_utc_time(data_dir, subset_output_dir, request):
"""Verify that the time UTC values are conserved in S5P files"""
trop_dir = join(data_dir, 'tropomi')
trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4'
variable = ['/PRODUCT/time_utc']
variable = ['/PRODUCT/time_utc', '/PRODUCT/corner']
bbox = np.array(((-180, 180), (-90, 90)))
output_file = "{}_{}".format(request.node.name, trop_file)
shutil.copyfile(
@@ -2161,6 +2161,8 @@
assert in_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0] ==\
out_nc_dataset.groups['PRODUCT'].variables['time_utc'][:].squeeze()[0]

assert out_nc_dataset.groups['PRODUCT'].variables['corner']

def test_bad_time_unit(subset_output_dir):
"""TODO: give this function a description
"""
