Skip to content

Commit

Permalink
Update CRS handling in FlatGeobuf (#883)
Browse files Browse the repository at this point in the history
- [x] Still need to fix piping through the CRS and using pyproj to
transform the WKT
- [x] Update Rust and Python write docs.
  • Loading branch information
kylebarron authored Dec 5, 2024
1 parent d54cbb2 commit fefa36f
Show file tree
Hide file tree
Showing 21 changed files with 648 additions and 266 deletions.
5 changes: 2 additions & 3 deletions python/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions python/geoarrow-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ async = [
[dependencies]
arrow = { workspace = true }
bytes = "1"
# For geo-traits impl
flatgeobuf = { git = "https://github.com/flatgeobuf/flatgeobuf", rev = "f554f2768b612e131e9f55d014eaa5b911a7f1b5", default-features = false }
futures = { version = "0.3", optional = true }
object_store = { workspace = true, features = [
"aws",
Expand Down
146 changes: 146 additions & 0 deletions python/geoarrow-io/python/geoarrow/rust/io/_flatgeobuf.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

from pathlib import Path
from typing import BinaryIO, Optional, Tuple, Union

from arro3.core import Table
from arro3.core.types import ArrowStreamExportable

# NOTE(review): `ObjectStore` is referenced in the annotations below but is not
# imported in this stub module — confirm it is imported (e.g. from the package
# root alongside the arro3 imports). With `from __future__ import annotations`
# this will not fail at runtime, but type checkers will flag the unresolved name.
def read_flatgeobuf(
    file: Union[str, Path, BinaryIO],
    *,
    store: Optional[ObjectStore] = None,
    batch_size: int = 65536,
    bbox: Tuple[float, float, float, float] | None = None,
) -> Table:
    """
    Read a FlatGeobuf file from a path on disk or a remote location into an Arrow Table.

    Example:
        Reading from a local path:

        ```py
        from geoarrow.rust.io import read_flatgeobuf

        table = read_flatgeobuf("path/to/file.fgb")
        ```

        Reading from a Python file object:

        ```py
        from geoarrow.rust.io import read_flatgeobuf

        with open("path/to/file.fgb", "rb") as file:
            table = read_flatgeobuf(file)
        ```

        Reading from an HTTP(S) url:

        ```py
        from geoarrow.rust.io import read_flatgeobuf

        url = "http://flatgeobuf.org/test/data/UScounties.fgb"
        table = read_flatgeobuf(url)
        ```

        Reading from a remote file on an S3 bucket:

        ```py
        from geoarrow.rust.io import ObjectStore, read_flatgeobuf

        options = {
            "aws_access_key_id": "...",
            "aws_secret_access_key": "...",
            "aws_region": "...",
        }
        store = ObjectStore('s3://bucket', options=options)
        table = read_flatgeobuf("path/in/bucket.fgb", store=store)
        ```

    Args:
        file: the path to the file or a Python file object in binary read mode.

    Other args:
        store: an ObjectStore instance for this url. This is required only if the file is at a remote
            location.
        batch_size: the number of rows to include in each internal batch of the table.
        bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
            `None`, no spatial filtering will be performed.

    Returns:
        Table from FlatGeobuf file.
    """

# NOTE(review): `ObjectStore` is referenced in the annotation below but is not
# imported in this stub module — confirm the import exists. Lazy annotations
# (`from __future__ import annotations`) hide this at runtime only.
async def read_flatgeobuf_async(
    path: str,
    *,
    store: Optional[ObjectStore] = None,
    batch_size: int = 65536,
    bbox: Tuple[float, float, float, float] | None = None,
) -> Table:
    """
    Read a FlatGeobuf file from a url into an Arrow Table.

    Example:
        Reading from an HTTP(S) url:

        ```py
        from geoarrow.rust.io import read_flatgeobuf_async

        url = "http://flatgeobuf.org/test/data/UScounties.fgb"
        table = await read_flatgeobuf_async(url)
        ```

        Reading from an S3 bucket:

        ```py
        from geoarrow.rust.io import ObjectStore, read_flatgeobuf_async

        options = {
            "aws_access_key_id": "...",
            "aws_secret_access_key": "...",
            "aws_region": "...",
        }
        store = ObjectStore('s3://bucket', options=options)
        table = await read_flatgeobuf_async("path/in/bucket.fgb", store=store)
        ```

    Args:
        path: the url or relative path to a remote FlatGeobuf file. If an argument is passed for
            `store`, this should be a path fragment relative to the root passed to the `ObjectStore`
            constructor.

    Other args:
        store: an ObjectStore instance for this url. This is required for non-HTTP urls.
        batch_size: the number of rows to include in each internal batch of the table.
        bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
            `None`, no spatial filtering will be performed.

    Returns:
        Table from FlatGeobuf file.
    """

def write_flatgeobuf(
    table: ArrowStreamExportable,
    file: str | Path | BinaryIO,
    *,
    write_index: bool = True,
    promote_to_multi: bool = True,
    title: str | None = None,
    description: str | None = None,
    metadata: str | None = None,
) -> None:
    """
    Write to a FlatGeobuf file on disk.

    Args:
        table: the Arrow RecordBatch, Table, or RecordBatchReader to write.
        file: the path to the file or a Python file object in binary write mode.

    Other args:
        write_index: whether to write a spatial index in the FlatGeobuf file. Defaults to True.
        promote_to_multi: whether to promote single geometries to their Multi counterpart
            (e.g. Polygon to MultiPolygon) when writing. Defaults to True.
            NOTE(review): semantics inferred from the parameter name — confirm against
            the underlying Rust writer's option before relying on this description.
        title: Dataset title. Defaults to `None`.
        description: Dataset description (intended for free form long text). Defaults to `None`.
        metadata: Dataset metadata (intended to be application specific). Defaults to `None`.
    """
145 changes: 5 additions & 140 deletions python/geoarrow-io/python/geoarrow/rust/io/_io.pyi
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
from __future__ import annotations

from pathlib import Path
from typing import (
BinaryIO,
List,
Optional,
Sequence,
Tuple,
Union,
)
from typing import BinaryIO, List, Optional, Sequence, Union

from arro3.core import Schema, Table
from arro3.core.types import (
Expand All @@ -22,6 +15,10 @@ from pyproj import CRS
from .enums import GeoParquetEncoding
from .types import BboxCovering, GeoParquetEncodingT

from ._flatgeobuf import read_flatgeobuf as read_flatgeobuf
from ._flatgeobuf import read_flatgeobuf_async as read_flatgeobuf_async
from ._flatgeobuf import write_flatgeobuf as write_flatgeobuf

class ParquetFile:
def __init__(self, path: str, store: ObjectStore) -> None:
"""
Expand Down Expand Up @@ -258,121 +255,6 @@ def read_csv(
Table from CSV file.
"""

def read_flatgeobuf(
file: Union[str, Path, BinaryIO],
*,
store: Optional[ObjectStore] = None,
batch_size: int = 65536,
bbox: Tuple[float, float, float, float] | None = None,
) -> Table:
"""
Read a FlatGeobuf file from a path on disk or a remote location into an Arrow Table.
Example:
Reading from a local path:
```py
from geoarrow.rust.io import read_flatgeobuf
table = read_flatgeobuf("path/to/file.fgb")
```
Reading from a Python file object:
```py
from geoarrow.rust.io import read_flatgeobuf
with open("path/to/file.fgb", "rb") as file:
table = read_flatgeobuf(file)
```
Reading from an HTTP(S) url:
```py
from geoarrow.rust.io import read_flatgeobuf
url = "http://flatgeobuf.org/test/data/UScounties.fgb"
table = read_flatgeobuf(url)
```
Reading from a remote file on an S3 bucket.
```py
from geoarrow.rust.io import ObjectStore, read_flatgeobuf
options = {
"aws_access_key_id": "...",
"aws_secret_access_key": "...",
"aws_region": "..."
}
store = ObjectStore('s3://bucket', options=options)
table = read_flatgeobuf("path/in/bucket.fgb", store=store)
```
Args:
file: the path to the file or a Python file object in binary read mode.
Other args:
store: an ObjectStore instance for this url. This is required only if the file is at a remote
location.
batch_size: the number of rows to include in each internal batch of the table.
bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
`None`, no spatial filtering will be performed.
Returns:
Table from FlatGeobuf file.
"""

async def read_flatgeobuf_async(
path: str,
*,
store: Optional[ObjectStore] = None,
batch_size: int = 65536,
bbox: Tuple[float, float, float, float] | None = None,
) -> Table:
"""
Read a FlatGeobuf file from a url into an Arrow Table.
Example:
Reading from an HTTP(S) url:
```py
from geoarrow.rust.io import read_flatgeobuf_async
url = "http://flatgeobuf.org/test/data/UScounties.fgb"
table = await read_flatgeobuf_async(url)
```
Reading from an S3 bucket:
```py
from geoarrow.rust.io import ObjectStore, read_flatgeobuf_async
options = {
"aws_access_key_id": "...",
"aws_secret_access_key": "...",
"aws_region": "..."
}
store = ObjectStore('s3://bucket', options=options)
table = await read_flatgeobuf_async("path/in/bucket.fgb", store=store)
```
Args:
path: the url or relative path to a remote FlatGeobuf file. If an argument is passed for
`store`, this should be a path fragment relative to the root passed to the `ObjectStore`
constructor.
Other args:
store: an ObjectStore instance for this url. This is required for non-HTTP urls.
batch_size: the number of rows to include in each internal batch of the table.
bbox: A spatial filter for reading rows, of the format (minx, miny, maxx, maxy). If set to
`None`, no spatial filtering will be performed.
Returns:
Table from FlatGeobuf file.
"""

def read_geojson(file: Union[str, Path, BinaryIO], *, batch_size: int = 65536) -> Table:
"""
Read a GeoJSON file from a path on disk into an Arrow Table.
Expand Down Expand Up @@ -547,23 +429,6 @@ def write_csv(table: ArrowStreamExportable, file: str | Path | BinaryIO) -> None
None
"""

def write_flatgeobuf(
table: ArrowStreamExportable,
file: str | Path | BinaryIO,
*,
write_index: bool = True,
) -> None:
"""
Write to a FlatGeobuf file on disk.
Args:
table: the Arrow RecordBatch, Table, or RecordBatchReader to write.
file: the path to the file or a Python file object in binary write mode.
Other args:
write_index: whether to write a spatial index in the FlatGeobuf file. Defaults to True.
"""

def write_geojson(
table: ArrowStreamExportable, file: Union[str, Path, BinaryIO]
) -> None:
Expand Down
Loading

0 comments on commit fefa36f

Please sign in to comment.