From f51443ae951fa199e6f78fa81651f19f7654026e Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 13 Dec 2024 17:36:44 -0500 Subject: [PATCH] Add `ST_MakeBox2D`, `ST_Expand`, fix RectArray round trip (#946) ### Change list - Add `ST_MakeBox2D`, `ST_Expand`. - Add test for each. - Fix round-tripping `RectArray` to an `ArrayRef` - Add test of round-tripping `RectArray` to an `ArrayRef` --- Cargo.lock | 1 + rust/geoarrow/src/array/rect/array.rs | 73 ++++--- rust/geoarrow/src/array/rect/builder.rs | 10 +- rust/geodatafusion/Cargo.toml | 1 + rust/geodatafusion/README.md | 4 +- .../native/bounding_box/{box.rs => box_2d.rs} | 0 .../src/udf/native/bounding_box/expand.rs | 179 ++++++++++++++++++ .../udf/native/bounding_box/make_box_2d.rs | 132 +++++++++++++ .../src/udf/native/bounding_box/mod.rs | 12 +- 9 files changed, 381 insertions(+), 31 deletions(-) rename rust/geodatafusion/src/udf/native/bounding_box/{box.rs => box_2d.rs} (100%) create mode 100644 rust/geodatafusion/src/udf/native/bounding_box/expand.rs create mode 100644 rust/geodatafusion/src/udf/native/bounding_box/make_box_2d.rs diff --git a/Cargo.lock b/Cargo.lock index 6b98331e..d6102c4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1849,6 +1849,7 @@ dependencies = [ name = "geodatafusion" version = "0.1.0-dev" dependencies = [ + "approx", "arrow", "arrow-array", "arrow-buffer", diff --git a/rust/geoarrow/src/array/rect/array.rs b/rust/geoarrow/src/array/rect/array.rs index 26371cba..d4ece60f 100644 --- a/rust/geoarrow/src/array/rect/array.rs +++ b/rust/geoarrow/src/array/rect/array.rs @@ -2,8 +2,8 @@ use std::sync::Arc; use arrow::array::AsArray; use arrow::datatypes::Float64Type; -use arrow_array::{Array, ArrayRef, Float64Array, StructArray}; -use arrow_buffer::NullBuffer; +use arrow_array::{Array, ArrayRef, StructArray}; +use arrow_buffer::{NullBuffer, ScalarBuffer}; use arrow_schema::{DataType, Field}; use crate::array::metadata::ArrayMetadata; @@ -182,14 +182,12 @@ impl IntoArrow for RectArray { fn into_arrow(self) -> Self::ArrowArray { let fields = rect_fields(self.data_type.dimension().unwrap()); let mut arrays: Vec = vec![]; - for buf in self.lower.buffers { - arrays.push(Arc::new(Float64Array::new(buf, None))); - } - for buf in self.upper.buffers { - arrays.push(Arc::new(Float64Array::new(buf, None))); - } - let validity = self.validity; + // values_array takes care of the correct number of dimensions + arrays.extend_from_slice(self.lower.values_array().as_slice()); + arrays.extend_from_slice(self.upper.values_array().as_slice()); + + let validity = self.validity; StructArray::new(fields, arrays, validity) } } @@ -202,23 +200,24 @@ impl TryFrom<(&StructArray, Dimension)> for RectArray { let columns = value.columns(); assert_eq!(columns.len(), dim.size() * 2); - let lower = match dim { - Dimension::XY => { - core::array::from_fn(|i| columns[i].as_primitive::().values().clone()) - } - Dimension::XYZ => { - core::array::from_fn(|i| columns[i].as_primitive::().values().clone()) + let dim_size = dim.size(); + let lower = core::array::from_fn(|i| { + if i < dim_size { + columns[i].as_primitive::().values().clone() + } else { + ScalarBuffer::from(vec![]) } - }; - let upper = match dim { - Dimension::XY => { - core::array::from_fn(|i| columns[i].as_primitive::().values().clone()) + }); + let upper = core::array::from_fn(|i| { + if i < dim_size { + columns[dim_size + i] + .as_primitive::() + .values() + .clone() + } else { + ScalarBuffer::from(vec![]) } - Dimension::XYZ => { - core::array::from_fn(|i| columns[i].as_primitive::().values().clone()) - } - }; - + }); Ok(Self::new( SeparatedCoordBuffer::new(lower, dim), SeparatedCoordBuffer::new(upper, dim), @@ -271,3 +270,29 @@ impl> From<(Vec>, Dimension)> for RectArray { mut_arr.into() } } + +#[cfg(test)] +mod test { + use super::*; + use crate::algorithm::native::eq::rect_eq; + use crate::array::RectBuilder; + use crate::datatypes::Dimension; + + #[test] + fn rect_array_round_trip() { + let rect = geo::Rect::new( + geo::coord! { x: 0.0, y: 5.0 }, + geo::coord! { x: 10.0, y: 15.0 }, + ); + let mut builder = + RectBuilder::with_capacity_and_options(Dimension::XY, 1, Default::default()); + builder.push_rect(Some(&rect)); + builder.push_min_max(&rect.min(), &rect.max()); + let rect_arr = builder.finish(); + + let arrow_arr = rect_arr.into_array_ref(); + let rect_arr_again = RectArray::try_from((arrow_arr.as_ref(), Dimension::XY)).unwrap(); + let rect_again = rect_arr_again.value(0); + assert!(rect_eq(&rect, &rect_again)); + } +} diff --git a/rust/geoarrow/src/array/rect/builder.rs b/rust/geoarrow/src/array/rect/builder.rs index 4f589156..3669e85a 100644 --- a/rust/geoarrow/src/array/rect/builder.rs +++ b/rust/geoarrow/src/array/rect/builder.rs @@ -6,7 +6,7 @@ use crate::scalar::Rect; use crate::trait_::IntoArrow; use arrow_array::{Array, StructArray}; use arrow_buffer::NullBufferBuilder; -use geo_traits::RectTrait; +use geo_traits::{CoordTrait, RectTrait}; use std::sync::Arc; /// The GeoArrow equivalent to `Vec>`: a mutable collection of Rects. @@ -168,6 +168,14 @@ impl RectBuilder { } } + /// Push min and max coordinates of a rect to the builder. + #[inline] + pub fn push_min_max(&mut self, min: &impl CoordTrait, max: &impl CoordTrait) { + self.lower.push_coord(min); + self.upper.push_coord(max); + self.validity.append_non_null() + } + /// Create this builder from a iterator of Rects. pub fn from_rects<'a>( geoms: impl ExactSizeIterator + 'a)>, diff --git a/rust/geodatafusion/Cargo.toml b/rust/geodatafusion/Cargo.toml index 943b02b9..360928ed 100644 --- a/rust/geodatafusion/Cargo.toml +++ b/rust/geodatafusion/Cargo.toml @@ -29,4 +29,5 @@ geoarrow = { path = "../geoarrow", features = ["flatgeobuf"] } thiserror = "1" [dev-dependencies] +approx = "0.5.1" tokio = { version = "1.9", features = ["macros", "fs", "rt-multi-thread"] } diff --git a/rust/geodatafusion/README.md b/rust/geodatafusion/README.md index c8f1cde1..474e84a9 100644 --- a/rust/geodatafusion/README.md +++ b/rust/geodatafusion/README.md @@ -337,10 +337,10 @@ Spatial extensions for [Apache DataFusion](https://datafusion.apache.org/), an e | Box2D | ✅ | Returns a BOX2D representing the 2D extent of a geometry. | | Box3D | | Returns a BOX3D representing the 3D extent of a geometry. | | ST_EstimatedExtent | | Returns the estimated extent of a spatial table. | -| ST_Expand | | Returns a bounding box expanded from another bounding box or a geometry. | +| ST_Expand | ✅ | Returns a bounding box expanded from another bounding box or a geometry. | | ST_Extent | | Aggregate function that returns the bounding box of geometries. | | ST_3DExtent | | Aggregate function that returns the 3D bounding box of geometries. | -| ST_MakeBox2D | | Creates a BOX2D defined by two 2D point geometries. | +| ST_MakeBox2D | ✅ | Creates a BOX2D defined by two 2D point geometries. | | ST_3DMakeBox | | Creates a BOX3D defined by two 3D point geometries. | | ST_XMax | ✅ | Returns the X maxima of a 2D or 3D bounding box or a geometry. | | ST_XMin | ✅ | Returns the X minima of a 2D or 3D bounding box or a geometry. | diff --git a/rust/geodatafusion/src/udf/native/bounding_box/box.rs b/rust/geodatafusion/src/udf/native/bounding_box/box_2d.rs similarity index 100% rename from rust/geodatafusion/src/udf/native/bounding_box/box.rs rename to rust/geodatafusion/src/udf/native/bounding_box/box_2d.rs diff --git a/rust/geodatafusion/src/udf/native/bounding_box/expand.rs b/rust/geodatafusion/src/udf/native/bounding_box/expand.rs new file mode 100644 index 00000000..515cbbb1 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/bounding_box/expand.rs @@ -0,0 +1,179 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow::array::AsArray; +use arrow::datatypes::Float64Type; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; +use geo_traits::{CoordTrait, RectTrait}; +use geoarrow::array::{RectArray, RectBuilder}; +use geoarrow::datatypes::Dimension; +use geoarrow::error::GeoArrowError; +use geoarrow::trait_::ArrayAccessor; +use geoarrow::ArrayBase; + +use crate::data_types::BOX2D_TYPE; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct Expand { + signature: Signature, +} + +impl Expand { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![BOX2D_TYPE.into(), DataType::Float64]), + TypeSignature::Exact(vec![ + BOX2D_TYPE.into(), + DataType::Float64, + DataType::Float64, + ]), + ], + Volatility::Immutable, + ), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Expand { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_expand" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> datafusion::error::Result { + Ok(arg_types.first().unwrap().clone()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(expand_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a bounding box expanded from the bounding box of the input, either by specifying a single distance with which the box should be expanded on both axes, or by specifying an expansion distance for each axis. Uses double-precision. Can be used for distance queries, or to add a bounding box filter to a query to take advantage of a spatial index.", + "ST_Expand(box)", + ) + .with_argument("box", "box2d") + .build() + })) + } +} + +fn expand_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let mut args = ColumnarValue::values_to_arrays(args)?.into_iter(); + let rect_array = args.next().unwrap(); + let factor1 = args.next().unwrap(); + let factor2 = args.next(); + + let dx = factor1.as_primitive::(); + + if BOX2D_TYPE + .to_data_type() + .equals_datatype(rect_array.data_type()) + { + let rect_array = RectArray::try_from((rect_array.as_ref(), Dimension::XY))?; + let mut builder = RectBuilder::with_capacity_and_options( + Dimension::XY, + rect_array.len(), + rect_array.metadata().clone(), + ); + + if let Some(dy) = factor2 { + let dy = dy.as_primitive::(); + + for val in rect_array.iter().zip(dx.iter()).zip(dy.iter()) { + if let ((Some(rect), Some(dx)), Some(dy)) = val { + builder.push_rect(Some(&expand_2d_rect(rect, dx, dy))); + } else { + builder.push_null(); + } + } + } else { + for val in rect_array.iter().zip(dx.iter()) { + if let (Some(rect), Some(dx)) = val { + builder.push_rect(Some(&expand_2d_rect(rect, dx, dx))); + } else { + builder.push_null(); + } + } + } + + return Ok(builder.finish().into_array_ref().into()); + } + + Err(Err(GeoArrowError::General(format!( + "Unexpected data type: {:?}", + rect_array.data_type() + )))?) +} + +#[inline] +fn expand_2d_rect(rect: impl RectTrait, dx: f64, dy: f64) -> geo::Rect { + let min = rect.min(); + let max = rect.max(); + + let new_min = geo::coord! { x: min.x() - dx, y: min.y() - dy }; + let new_max = geo::coord! { x: max.x() + dx, y: max.y() + dy }; + + geo::Rect::new(new_min, new_max) +} + +#[cfg(test)] +mod test { + use approx::relative_eq; + use datafusion::prelude::*; + use geo_traits::{CoordTrait, RectTrait}; + use geoarrow::array::RectArray; + use geoarrow::datatypes::Dimension; + use geoarrow::trait_::ArrayAccessor; + + use crate::data_types::BOX2D_TYPE; + use crate::udf::native::register_native; + + #[tokio::test] + async fn test() { + let ctx = SessionContext::new(); + register_native(&ctx); + + let out = ctx + .sql("SELECT ST_Expand(ST_MakeBox2D(ST_Point(0, 5), ST_Point(10, 20)), 10, 20);") + .await + .unwrap(); + + let batches = out.collect().await.unwrap(); + assert_eq!(batches.len(), 1); + let batch = batches.into_iter().next().unwrap(); + assert_eq!(batch.columns().len(), 1); + assert!(batch + .schema() + .field(0) + .data_type() + .equals_datatype(&BOX2D_TYPE.into())); + + let rect_array = RectArray::try_from((batch.columns()[0].as_ref(), Dimension::XY)).unwrap(); + let rect = rect_array.value(0); + + assert!(relative_eq!(rect.min().x(), -10.0)); + assert!(relative_eq!(rect.min().y(), -15.0)); + assert!(relative_eq!(rect.max().x(), 20.0)); + assert!(relative_eq!(rect.max().y(), 40.0)); + } +} diff --git a/rust/geodatafusion/src/udf/native/bounding_box/make_box_2d.rs b/rust/geodatafusion/src/udf/native/bounding_box/make_box_2d.rs new file mode 100644 index 00000000..2967b56d --- /dev/null +++ b/rust/geodatafusion/src/udf/native/bounding_box/make_box_2d.rs @@ -0,0 +1,132 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use geo_traits::PointTrait; +use geoarrow::array::{PointArray, RectBuilder}; +use geoarrow::datatypes::Dimension; +use geoarrow::trait_::ArrayAccessor; +use geoarrow::ArrayBase; + +use crate::data_types::{BOX2D_TYPE, POINT2D_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct MakeBox2D { + signature: Signature, +} + +impl MakeBox2D { + pub fn new() -> Self { + Self { + signature: Signature::exact( + vec![POINT2D_TYPE.into(), POINT2D_TYPE.into()], + Volatility::Immutable, + ), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MakeBox2D { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makebox2d" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(BOX2D_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(make_box2d_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Creates a box2d defined by two Point geometries. This is useful for doing range queries.", + "ST_MakeBox2D(ST_Point(-989502.1875, 528439.5625), ST_Point(-987121.375, 529933.1875))", + ) + .with_argument("pointLowLeft", "geometry") + .with_argument("pointUpRight", "geometry") + .build() + })) + } +} + +fn make_box2d_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let mut args = ColumnarValue::values_to_arrays(args)?.into_iter(); + let min = args.next().unwrap(); + let max = args.next().unwrap(); + + let min = PointArray::try_from((min.as_ref(), Dimension::XY))?; + let max = PointArray::try_from((max.as_ref(), Dimension::XY))?; + + let mut builder = + RectBuilder::with_capacity_and_options(Dimension::XY, min.len(), min.metadata().clone()); + + for val in min.iter().zip(max.iter()) { + if let (Some(min), Some(max)) = val { + builder.push_min_max(&min.coord().unwrap(), &max.coord().unwrap()); + } else { + builder.push_null(); + } + } + + Ok(builder.finish().into_array_ref().into()) +} + +#[cfg(test)] +mod test { + use approx::relative_eq; + use datafusion::prelude::*; + use geo_traits::{CoordTrait, RectTrait}; + use geoarrow::array::RectArray; + use geoarrow::datatypes::Dimension; + use geoarrow::trait_::ArrayAccessor; + + use crate::data_types::BOX2D_TYPE; + use crate::udf::native::register_native; + + #[tokio::test] + async fn test() { + let ctx = SessionContext::new(); + register_native(&ctx); + + let out = ctx + .sql("SELECT ST_MakeBox2D(ST_Point(0, 5), ST_Point(10, 20));") + .await + .unwrap(); + let batches = out.collect().await.unwrap(); + assert_eq!(batches.len(), 1); + let batch = batches.into_iter().next().unwrap(); + assert_eq!(batch.columns().len(), 1); + assert!(batch + .schema() + .field(0) + .data_type() + .equals_datatype(&BOX2D_TYPE.into())); + + let rect_array = RectArray::try_from((batch.columns()[0].as_ref(), Dimension::XY)).unwrap(); + let rect = rect_array.value(0); + + assert!(relative_eq!(rect.min().x(), 0.0)); + assert!(relative_eq!(rect.min().y(), 5.0)); + assert!(relative_eq!(rect.max().x(), 10.0)); + assert!(relative_eq!(rect.max().y(), 20.0)); + } +} diff --git a/rust/geodatafusion/src/udf/native/bounding_box/mod.rs b/rust/geodatafusion/src/udf/native/bounding_box/mod.rs index 962ec875..bbb7b58a 100644 --- a/rust/geodatafusion/src/udf/native/bounding_box/mod.rs +++ b/rust/geodatafusion/src/udf/native/bounding_box/mod.rs @@ -1,13 +1,17 @@ -mod r#box; +mod box_2d; +mod expand; mod extrema; +mod make_box_2d; use datafusion::prelude::SessionContext; /// Register all provided bounding box functions pub fn register_udfs(ctx: &SessionContext) { - ctx.register_udf(extrema::XMin::new().into()); - ctx.register_udf(extrema::YMin::new().into()); + ctx.register_udf(box_2d::Box2D::new().into()); + ctx.register_udf(expand::Expand::new().into()); ctx.register_udf(extrema::XMax::new().into()); + ctx.register_udf(extrema::XMin::new().into()); ctx.register_udf(extrema::YMax::new().into()); - ctx.register_udf(r#box::Box2D::new().into()); + ctx.register_udf(extrema::YMin::new().into()); + ctx.register_udf(make_box_2d::MakeBox2D::new().into()); }