From 6d6ebfb44e366f6e01f94b408116c4d041641f7e Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Fri, 27 Dec 2024 18:26:46 +0100 Subject: [PATCH] BUG: fix writing fids to e.g. GPKG file with use_arrow (#511) --- CHANGES.md | 1 + pyogrio/_io.pyx | 23 ++++++++++++++++ pyogrio/tests/test_geopandas_io.py | 42 ++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 39c0ff8f..117751cd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,6 +9,7 @@ ### Bug fixes - Fix WKB writing on big-endian systems (#497). +- Fix writing fids to e.g. GPKG file with use_arrow (#511). ### Packaging diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 55245799..d7334838 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -2741,6 +2741,18 @@ cdef create_fields_from_arrow_schema( IF CTE_GDAL_VERSION < (3, 8, 0): raise RuntimeError("Need GDAL>=3.8 for Arrow write support") + # Some formats store the FID explicitly in a real column, e.g. GPKG. + # For those formats, OGR_L_GetFIDColumn will return the column name used + # for this and otherwise it returns "". GDAL typically also provides a + # layer creation option to overrule the column name to be used as FID + # column. When a column with the appropriate name is present in the data, + # GDAL will automatically use it for the FID. Reference: + # https://gdal.org/en/stable/tutorials/vector_api_tut.html#writing-to-ogr-using-the-arrow-c-data-interface + # Hence, the column should not be created as an ordinary field as well. + # Doing so triggers a bug in GDAL < 3.10.1: + # https://github.com/OSGeo/gdal/issues/11527#issuecomment-2556092722 + fid_column = get_string(OGR_L_GetFIDColumn(destLayer)) + # The schema object is a struct type where each child is a column. cdef ArrowSchema* child for i in range(schema.n_children): @@ -2752,6 +2764,17 @@ cdef create_fields_from_arrow_schema( # Don't create property for geometry column if get_string(child.name) == geometry_name or is_arrow_geometry_field(child): continue + + # Don't create property for column that will already be used as FID + # Note: it seems that GDAL initially uses a case-sensitive check of the + # FID column, but then falls back to case-insensitive matching via + # the "ordinary" field being added. So, the check here needs to be + # case-sensitive so the column is still added as regular column if the + # casing isn't matched, otherwise the column is simply "lost". + # Note2: in the non-arrow path, the FID column is also treated + # case-insensitive, so this is consistent with that. + if fid_column != "" and get_string(child.name) == fid_column: + continue if not OGR_L_CreateFieldFromArrowSchema(destLayer, child, options): exc = check_last_error() diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 96d9e3a0..a65b5baa 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -993,6 +993,48 @@ def test_write_csv_encoding(tmp_path, encoding): assert csv_bytes == csv_pyogrio_bytes +@pytest.mark.parametrize( + "ext, fid_column, fid_param_value", + [ + (".gpkg", "fid", None), + (".gpkg", "FID", None), + (".sqlite", "ogc_fid", None), + (".gpkg", "fid_custom", "fid_custom"), + (".gpkg", "FID_custom", "fid_custom"), + (".sqlite", "ogc_fid_custom", "ogc_fid_custom"), + ], +) +@pytest.mark.requires_arrow_write_api +def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow): + """Test to specify FIDs to save when writing to a file. + + Saving custom FIDs is only supported for formats that actually store the FID, like + e.g. GPKG and SQLite. The fid_column name check is case-insensitive. + + Typically, GDAL supports using a custom FID column for these file formats via a + `FID` layer creation option, which is also tested here. If `fid_param_value` is + specified (not None), an `fid` parameter is passed to `write_dataframe`, causing + GDAL to use the column name specified for the FID. + """ + input_gdf = gp.GeoDataFrame( + {fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326" + ) + kwargs = {} + if fid_param_value is not None: + kwargs["fid"] = fid_param_value + path = tmp_path / f"test{ext}" + + write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs) + + assert path.exists() + output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow) + output_gdf = output_gdf.reset_index() + + # pyogrio always sets "fid" as index name with `fid_as_index` + expected_gdf = input_gdf.rename(columns={fid_column: "fid"}) + assert_geodataframe_equal(output_gdf, expected_gdf) + + @pytest.mark.parametrize("ext", ALL_EXTS) @pytest.mark.requires_arrow_write_api def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):