Skip to content

Commit

Permalink
BUG: fix writing fids to e.g. GPKG file with use_arrow (#511)
Browse files Browse the repository at this point in the history
  • Loading branch information
theroggy authored Dec 27, 2024
1 parent 07416f4 commit 6d6ebfb
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
### Bug fixes

- Fix WKB writing on big-endian systems (#497).
- Fix writing fids to e.g. GPKG file with use_arrow (#511).

### Packaging

Expand Down
23 changes: 23 additions & 0 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2741,6 +2741,18 @@ cdef create_fields_from_arrow_schema(
IF CTE_GDAL_VERSION < (3, 8, 0):
raise RuntimeError("Need GDAL>=3.8 for Arrow write support")

# Some formats store the FID explicitly in a real column, e.g. GPKG.
# For those formats, OGR_L_GetFIDColumn will return the column name used
# for this and otherwise it returns "". GDAL typically also provides a
# layer creation option to overrule the column name to be used as FID
# column. When a column with the appropriate name is present in the data,
# GDAL will automatically use it for the FID. Reference:
# https://gdal.org/en/stable/tutorials/vector_api_tut.html#writing-to-ogr-using-the-arrow-c-data-interface
# Hence, the column should not be created as an ordinary field as well.
# Doing so triggers a bug in GDAL < 3.10.1:
# https://github.com/OSGeo/gdal/issues/11527#issuecomment-2556092722
fid_column = get_string(OGR_L_GetFIDColumn(destLayer))

# The schema object is a struct type where each child is a column.
cdef ArrowSchema* child
for i in range(schema.n_children):
Expand All @@ -2752,6 +2764,17 @@ cdef create_fields_from_arrow_schema(
# Don't create property for geometry column
if get_string(child.name) == geometry_name or is_arrow_geometry_field(child):
continue

# Don't create property for column that will already be used as FID
# Note: it seems that GDAL initially uses a case-sensitive check of the
# FID column, but then falls back to case-insensitive matching via
# the "ordinary" field being added. So, the check here needs to be
# case-sensitive so the column is still added as regular column if the
# casing isn't matched, otherwise the column is simply "lost".
# Note2: in the non-arrow path, the FID column is also treated
# case-insensitive, so this is consistent with that.
if fid_column != "" and get_string(child.name) == fid_column:
continue

if not OGR_L_CreateFieldFromArrowSchema(destLayer, child, options):
exc = check_last_error()
Expand Down
42 changes: 42 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,6 +993,48 @@ def test_write_csv_encoding(tmp_path, encoding):
assert csv_bytes == csv_pyogrio_bytes


@pytest.mark.parametrize(
"ext, fid_column, fid_param_value",
[
(".gpkg", "fid", None),
(".gpkg", "FID", None),
(".sqlite", "ogc_fid", None),
(".gpkg", "fid_custom", "fid_custom"),
(".gpkg", "FID_custom", "fid_custom"),
(".sqlite", "ogc_fid_custom", "ogc_fid_custom"),
],
)
@pytest.mark.requires_arrow_write_api
def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow):
"""Test to specify FIDs to save when writing to a file.
Saving custom FIDs is only supported for formats that actually store the FID, like
e.g. GPKG and SQLite. The fid_column name check is case-insensitive.
Typically, GDAL supports using a custom FID column for these file formats via a
`FID` layer creation option, which is also tested here. If `fid_param_value` is
specified (not None), an `fid` parameter is passed to `write_dataframe`, causing
GDAL to use the column name specified for the FID.
"""
input_gdf = gp.GeoDataFrame(
{fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326"
)
kwargs = {}
if fid_param_value is not None:
kwargs["fid"] = fid_param_value
path = tmp_path / f"test{ext}"

write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs)

assert path.exists()
output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow)
output_gdf = output_gdf.reset_index()

# pyogrio always sets "fid" as index name with `fid_as_index`
expected_gdf = input_gdf.rename(columns={fid_column: "fid"})
assert_geodataframe_equal(output_gdf, expected_gdf)


@pytest.mark.parametrize("ext", ALL_EXTS)
@pytest.mark.requires_arrow_write_api
def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):
Expand Down

0 comments on commit 6d6ebfb

Please sign in to comment.