diff --git a/python/geoarrow-io/src/io/csv.rs b/python/geoarrow-io/src/io/csv.rs index a976f5fa..5b5de8e9 100644 --- a/python/geoarrow-io/src/io/csv.rs +++ b/python/geoarrow-io/src/io/csv.rs @@ -64,7 +64,7 @@ pub fn read_csv( file.seek(SeekFrom::Start(pos))?; - let record_batch_reader = csv::read_csv(file, schema.into(), options)?; + let record_batch_reader = csv::read_csv(file, schema, options)?; let schema = record_batch_reader.schema(); let batches = record_batch_reader.collect::, _>>()?; Ok(PyTable::try_new(batches, schema)?.into()) diff --git a/rust/geoarrow/src/array/metadata.rs b/rust/geoarrow/src/array/metadata.rs index 84c7359d..8cab7577 100644 --- a/rust/geoarrow/src/array/metadata.rs +++ b/rust/geoarrow/src/array/metadata.rs @@ -104,6 +104,10 @@ impl ArrayMetadata { Self::default().with_unknown_crs_type(value) } + pub fn from_authority_code(value: String) -> Self { + Self::default().with_authority_code(value) + } + pub fn with_projjson(mut self, value: Value) -> Self { self.crs = Some(value); self.crs_type = Some(CRSType::Projjson); @@ -122,6 +126,12 @@ impl ArrayMetadata { self } + pub fn with_authority_code(mut self, value: String) -> Self { + self.crs = Some(Value::String(value)); + self.crs_type = Some(CRSType::AuthorityCode); + self + } + pub fn with_edges(mut self, edges: Edges) -> Self { self.edges = Some(edges); self diff --git a/rust/geoarrow/src/io/crs.rs b/rust/geoarrow/src/io/crs.rs index 55424de0..e391cad4 100644 --- a/rust/geoarrow/src/io/crs.rs +++ b/rust/geoarrow/src/io/crs.rs @@ -1,3 +1,6 @@ +//! Defines CRS transforms used for writing GeoArrow data to file formats that require different +//! CRS representations. + use std::fmt::Debug; use serde_json::Value; @@ -51,7 +54,9 @@ pub trait CRSTransform: Debug { } } -/// A default implementation for [CRSTransform] which errors on any CRS conversion. +/// A default implementation for [CRSTransform] which does not do any CRS conversion. 
+/// +/// Instead of raising an error, this will **silently drop any CRS information when writing data**. #[derive(Debug, Clone, Default)] pub struct DefaultCRSTransform {} diff --git a/rust/geoarrow/src/io/csv/mod.rs b/rust/geoarrow/src/io/csv/mod.rs index 5d0c94fa..ebe2394e 100644 --- a/rust/geoarrow/src/io/csv/mod.rs +++ b/rust/geoarrow/src/io/csv/mod.rs @@ -1,4 +1,46 @@ //! Read from and write to CSV files. +//! +//! # Examples +//! +//! ``` +//! use std::io::{Cursor, Seek}; +//! +//! use arrow_array::RecordBatchReader; +//! +//! use geoarrow::array::CoordType; +//! use geoarrow::io::csv::{infer_csv_schema, read_csv, CSVReaderOptions}; +//! use geoarrow::table::Table; +//! +//! let s = r#" +//! address,type,datetime,report location,incident number +//! 904 7th Av,Car Fire,05/22/2019 12:55:00 PM,POINT (-122.329051 47.6069),F190051945 +//! 9610 53rd Av S,Aid Response,05/22/2019 12:55:00 PM,POINT (-122.266529 47.515984),F190051946"#; +//! let mut cursor = Cursor::new(s); +//! +//! let options = CSVReaderOptions { +//! coord_type: CoordType::Separated, +//! geometry_column_name: Some("report location".to_string()), +//! has_header: Some(true), +//! ..Default::default() +//! }; +//! +//! // Note: this initial schema currently represents the CSV data _on disk_. That is, the +//! // geometry column is represented as a string. This may change in the future. +//! let (schema, _read_records, _geometry_column_name) = +//! infer_csv_schema(&mut cursor, &options).unwrap(); +//! cursor.rewind().unwrap(); +//! +//! // `read_csv` returns a RecordBatchReader, which enables streaming the CSV without reading +//! // all of it. +//! let record_batch_reader = read_csv(cursor, schema, options).unwrap(); +//! let geospatial_schema = record_batch_reader.schema(); +//! let table = Table::try_new( +//! record_batch_reader.collect::<Result<_, _>>().unwrap(), +//! geospatial_schema, +//! ) +//! .unwrap(); +//! ``` +//! 
pub use reader::{infer_csv_schema, read_csv, CSVReaderOptions}; pub use writer::write_csv; diff --git a/rust/geoarrow/src/io/csv/reader.rs b/rust/geoarrow/src/io/csv/reader.rs index 73317bcc..8fae0d8d 100644 --- a/rust/geoarrow/src/io/csv/reader.rs +++ b/rust/geoarrow/src/io/csv/reader.rs @@ -95,29 +95,34 @@ impl Default for CSVReaderOptions { } } -/// Infer a CSV file's schema +/// Infer a CSV file's schema. +/// +/// By default, the reader will **scan the entire CSV file** to infer the data's +/// schema. If your data is large, you can limit the number of records scanned +/// with the `max_records` field of [CSVReaderOptions]. +/// /// Returns (Schema, records_read, geometry column name) /// /// Note that the geometry column in the Schema is still left as a String. pub fn infer_csv_schema( reader: impl Read, options: &CSVReaderOptions, -) -> Result<(Schema, usize, String)> { +) -> Result<(SchemaRef, usize, String)> { let format = options.to_format(); let (schema, records_read) = format.infer_schema(reader, options.max_records)?; let geometry_col_name = find_geometry_column(&schema, options.geometry_column_name.as_deref())?; - Ok((schema, records_read, geometry_col_name)) + Ok((Arc::new(schema), records_read, geometry_col_name)) } -/// Read a CSV file to a Table +/// Read a CSV file to a [RecordBatchReader]. /// /// This expects a geometry to be encoded as WKT within one column. /// -/// Note that this is Read and not Read + Seek. This means that you must infer the schema yourself -/// before calling this function. This allows using with objects that are only `Read` in the case -/// when you already know the file's schema. +/// Note that the input required here is [`Read`] and not [`Read`] + [`Seek`][std::io::Seek]. This +/// means that you must infer the schema yourself before calling this function. This lets you +/// pass objects that are only `Read` when you already know the file's schema. 
/// /// This schema is expected to be the schema inferred by `arrow-csv`'s /// [`infer_schema`][Format::infer_schema]. That means the geometry should be a string in the diff --git a/rust/geoarrow/src/io/gdal/mod.rs b/rust/geoarrow/src/io/gdal/mod.rs index a6d363c8..7981536d 100644 --- a/rust/geoarrow/src/io/gdal/mod.rs +++ b/rust/geoarrow/src/io/gdal/mod.rs @@ -1,3 +1,5 @@ +//! Read-only integration with [GDAL][gdal]. + mod reader; pub use reader::read_gdal; diff --git a/rust/geoarrow/src/io/geos/mod.rs b/rust/geoarrow/src/io/geos/mod.rs index aced9efe..cf2f6c8b 100644 --- a/rust/geoarrow/src/io/geos/mod.rs +++ b/rust/geoarrow/src/io/geos/mod.rs @@ -1,4 +1,4 @@ //! Export to and import from data structures of the [`geos`] crate. mod array; -pub mod scalar; +pub(crate) mod scalar; diff --git a/rust/geoarrow/src/io/mod.rs b/rust/geoarrow/src/io/mod.rs index 5f450c88..38eeadd3 100644 --- a/rust/geoarrow/src/io/mod.rs +++ b/rust/geoarrow/src/io/mod.rs @@ -6,7 +6,7 @@ pub mod crs; #[cfg(feature = "csv")] pub mod csv; -pub mod display; +pub(crate) mod display; #[cfg(feature = "flatgeobuf")] pub mod flatgeobuf; #[cfg(feature = "gdal")] @@ -14,7 +14,7 @@ pub mod gdal; pub mod geojson; pub mod geojson_lines; #[cfg(feature = "geos")] -pub mod geos; +pub(crate) mod geos; pub mod geozero; pub mod ipc; #[cfg(feature = "parquet")] diff --git a/rust/geoarrow/src/io/shapefile/mod.rs b/rust/geoarrow/src/io/shapefile/mod.rs index ac1f77ff..868124c5 100644 --- a/rust/geoarrow/src/io/shapefile/mod.rs +++ b/rust/geoarrow/src/io/shapefile/mod.rs @@ -1,3 +1,7 @@ +//! Read from [Shapefile](https://www.esri.com/content/dam/esrisites/sitecore-archive/Files/Pdfs/library/whitepapers/pdfs/shapefile.pdf) datasets. +//! +//! This wraps the [shapefile] crate. 
+ mod reader; mod scalar; diff --git a/rust/geoarrow/src/io/stream.rs b/rust/geoarrow/src/io/stream.rs index 19534e91..7076f51c 100644 --- a/rust/geoarrow/src/io/stream.rs +++ b/rust/geoarrow/src/io/stream.rs @@ -3,8 +3,8 @@ use crate::table::Table; use arrow_array::{RecordBatchIterator, RecordBatchReader as _RecordBatchReader}; use arrow_schema::SchemaRef; -/// A newtype wrapper around an [arrow_array::RecordBatchReader] so that we can impl the -/// [geozero::GeozeroDatasource] trait. +/// A newtype wrapper around an [`arrow_array::RecordBatchReader`] so that we can implement the +/// [`geozero::GeozeroDatasource`] trait on it. pub struct RecordBatchReader(Option>); impl RecordBatchReader { diff --git a/rust/geoarrow/src/io/wkb/mod.rs b/rust/geoarrow/src/io/wkb/mod.rs index 03b68f02..ecb9fc20 100644 --- a/rust/geoarrow/src/io/wkb/mod.rs +++ b/rust/geoarrow/src/io/wkb/mod.rs @@ -1,4 +1,7 @@ -//! An optimized implementation of reading and writing ISO-flavored WKB-encoded geometries. +//! Read and write geometries encoded as [Well-Known Binary](https://libgeos.org/specifications/wkb/). +//! +//! This wraps the [wkb] crate. As such, it currently supports reading the ISO and extended (EWKB) +//! variants of WKB. Currently, it always writes the ISO WKB variant. mod api; pub(crate) mod writer; diff --git a/rust/geoarrow/src/io/wkt/mod.rs b/rust/geoarrow/src/io/wkt/mod.rs index 66ed5386..026f332c 100644 --- a/rust/geoarrow/src/io/wkt/mod.rs +++ b/rust/geoarrow/src/io/wkt/mod.rs @@ -1,3 +1,60 @@ +//! Read and write geometries encoded as [Well-Known Text](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry). +//! +//! ## Example +//! +//! ``` +//! use std::sync::Arc; +//! +//! use arrow_array::StringArray; +//! +//! use geoarrow::array::metadata::ArrayMetadata; +//! use geoarrow::array::{AsNativeArray, CoordType, GeometryArray, WKTArray}; +//! use geoarrow::datatypes::NativeType; +//! use geoarrow::io::wkt::{read_wkt, ToWKT}; +//! 
use geoarrow::trait_::ArrayAccessor; +//! use geoarrow::NativeArray; +//! +//! // Start with some WKT data +//! let wkt_strings = vec![ +//! "POINT(30 10)", +//! "LINESTRING(30 10, 10 30, 40 40)", +//! "POLYGON((30 10, 40 40, 20 40, 10 20, 30 10))", +//! ]; +//! +//! // Construct an Arrow StringArray from this data +//! let arrow_arr = StringArray::from_iter_values(wkt_strings); +//! +//! // GeoArrow has a `WKTArray` concept in order to associate geospatial metadata with WKT data. +//! // Here, we associate CRS information with the WKT array, which will be maintained in the +//! // parsed representation. +//! let array_metadata = Arc::new(ArrayMetadata::from_authority_code("EPSG:4326".to_string())); +//! let wkt_array = WKTArray::new(arrow_arr, array_metadata); +//! +//! // Parse this WKT array to an `Arc<dyn NativeArray>` +//! let geometry_array: Arc<dyn NativeArray> = +//! read_wkt(&wkt_array, CoordType::Separated, false).unwrap(); +//! +//! // All parsed WKT data currently has `NativeType::Geometry`, because there's no way to know in +//! // advance what the geometry type of the WKT is. +//! assert!(matches!( +//! geometry_array.data_type(), +//! NativeType::Geometry(CoordType::Separated) +//! )); +//! +//! // Now we can downcast the dynamic reference to a concrete `GeometryArray`, and access a value +//! // as a `geo::Geometry` +//! let geometry_array_ref = geometry_array.as_ref(); +//! let downcasted: &GeometryArray = geometry_array_ref.as_geometry(); +//! matches!( +//! downcasted.value_as_geo(0), +//! geo::Geometry::Point(geo::Point(geo::Coord { x: 30.0, y: 10.0 })) +//! ); +//! +//! // Then we can write back to WKT +//! let wkt_array_again: WKTArray<i32> = downcasted.as_ref().to_wkt().unwrap(); +//! assert_eq!(wkt_array_again.into_inner().value(0), "POINT(30 10)") +//! 
``` + mod reader; mod writer; diff --git a/rust/geoarrow/src/io/wkt/writer.rs b/rust/geoarrow/src/io/wkt/writer.rs index 957d1727..fa01630d 100644 --- a/rust/geoarrow/src/io/wkt/writer.rs +++ b/rust/geoarrow/src/io/wkt/writer.rs @@ -12,9 +12,13 @@ use wkt::to_wkt::{ write_multi_point, write_multi_polygon, write_point, write_polygon, write_rect, }; +/// Serialize a geometry array to Well-Known Text pub trait ToWKT { + /// The output type of the operation. You can specify whether you want to use i32 or i64 + /// offsets for the Arrow string array. type Output; + /// Convert to WKT. fn to_wkt(&self) -> Self::Output; } @@ -41,16 +45,6 @@ impl ToWKT for &dyn NativeArray { } match self.data_type() { - // Point(_, _) => { - // for maybe_geom in self.as_point().iter() { - // if let Some(geom) = maybe_geom { - // write_point(&mut output_array, &geom)?; - // output_array.append_value(""); - // } else { - // output_array.append_null(); - // } - // } - // } Point(_, _) => impl_to_wkt!(as_point, write_point), LineString(_, _) => impl_to_wkt!(as_line_string, write_linestring), Polygon(_, _) => impl_to_wkt!(as_polygon, write_polygon), diff --git a/rust/geoarrow/src/table.rs b/rust/geoarrow/src/table.rs index 6052483f..75cfd9a0 100644 --- a/rust/geoarrow/src/table.rs +++ b/rust/geoarrow/src/table.rs @@ -32,7 +32,7 @@ pub(crate) static GEOARROW_EXTENSION_NAMES: Set<&'static str> = phf_set! { "ogc.wkb", }; -/// An Arrow table that MAY contain one or more geospatial columns. +/// An Arrow table that may contain one or more geospatial columns. /// /// This Table object is designed to be interoperable with non-geospatial Arrow libraries, and thus /// does not _require_ a geometry column.