From 6d6ebfb44e366f6e01f94b408116c4d041641f7e Mon Sep 17 00:00:00 2001
From: Pieter Roggemans <pieter.roggemans@gmail.com>
Date: Fri, 27 Dec 2024 18:26:46 +0100
Subject: [PATCH] BUG: fix writing fids to e.g. GPKG file with use_arrow (#511)

---
 CHANGES.md                         |  1 +
 pyogrio/_io.pyx                    | 23 ++++++++++++++++
 pyogrio/tests/test_geopandas_io.py | 42 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 39c0ff8f..117751cd 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,6 +9,7 @@
 ### Bug fixes
 
 -   Fix WKB writing on big-endian systems (#497).
+-   Fix writing fids to e.g. GPKG file with use_arrow (#511).
 
 ### Packaging
 
diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index 55245799..d7334838 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -2741,6 +2741,18 @@ cdef create_fields_from_arrow_schema(
     IF CTE_GDAL_VERSION < (3, 8, 0):
         raise RuntimeError("Need GDAL>=3.8 for Arrow write support")
 
+    # Some formats store the FID explicitly in a real column, e.g. GPKG.
+    # For those formats, OGR_L_GetFIDColumn will return the column name used
+    # for this and otherwise it returns "". GDAL typically also provides a
+    # layer creation option to overrule the column name to be used as FID
+    # column. When a column with the appropriate name is present in the data,
+    # GDAL will automatically use it for the FID. Reference:
+    # https://gdal.org/en/stable/tutorials/vector_api_tut.html#writing-to-ogr-using-the-arrow-c-data-interface
+    # Hence, the column should not be created as an ordinary field as well.
+    # Doing so triggers a bug in GDAL < 3.10.1:
+    # https://github.com/OSGeo/gdal/issues/11527#issuecomment-2556092722
+    fid_column = get_string(OGR_L_GetFIDColumn(destLayer))
+
     # The schema object is a struct type where each child is a column.
     cdef ArrowSchema* child
     for i in range(schema.n_children):
@@ -2752,6 +2764,17 @@ cdef create_fields_from_arrow_schema(
         # Don't create property for geometry column
         if get_string(child.name) == geometry_name or is_arrow_geometry_field(child):
             continue
+        
+        # Don't create property for column that will already be used as FID
+        # Note: it seems that GDAL initially uses a case-sensitive check of the
+        # FID column, but then falls back to case-insensitive matching via
+        # the "ordinary" field being added. So, the check here needs to be
+        # case-sensitive so the column is still added as regular column if the
+        # casing isn't matched, otherwise the column is simply "lost".
+        # Note2: in the non-arrow path, the FID column is also treated
+        # case-insensitive, so this is consistent with that.
+        if fid_column != "" and get_string(child.name) == fid_column:
+            continue
 
         if not OGR_L_CreateFieldFromArrowSchema(destLayer, child, options):
             exc = check_last_error()
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 96d9e3a0..a65b5baa 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -993,6 +993,48 @@ def test_write_csv_encoding(tmp_path, encoding):
     assert csv_bytes == csv_pyogrio_bytes
 
 
+@pytest.mark.parametrize(
+    "ext, fid_column, fid_param_value",
+    [
+        (".gpkg", "fid", None),
+        (".gpkg", "FID", None),
+        (".sqlite", "ogc_fid", None),
+        (".gpkg", "fid_custom", "fid_custom"),
+        (".gpkg", "FID_custom", "fid_custom"),
+        (".sqlite", "ogc_fid_custom", "ogc_fid_custom"),
+    ],
+)
+@pytest.mark.requires_arrow_write_api
+def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow):
+    """Test to specify FIDs to save when writing to a file.
+
+    Saving custom FIDs is only supported for formats that actually store the FID, like
+    e.g. GPKG and SQLite. The fid_column name check is case-insensitive.
+
+    Typically, GDAL supports using a custom FID column for these file formats via a
+    `FID` layer creation option, which is also tested here. If `fid_param_value` is
+    specified (not None), an `fid` parameter is passed to `write_dataframe`, causing
+    GDAL to use the column name specified for the FID.
+    """
+    input_gdf = gp.GeoDataFrame(
+        {fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326"
+    )
+    kwargs = {}
+    if fid_param_value is not None:
+        kwargs["fid"] = fid_param_value
+    path = tmp_path / f"test{ext}"
+
+    write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs)
+
+    assert path.exists()
+    output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow)
+    output_gdf = output_gdf.reset_index()
+
+    # pyogrio always sets "fid" as index name with `fid_as_index`
+    expected_gdf = input_gdf.rename(columns={fid_column: "fid"})
+    assert_geodataframe_equal(output_gdf, expected_gdf)
+
+
 @pytest.mark.parametrize("ext", ALL_EXTS)
 @pytest.mark.requires_arrow_write_api
 def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):