Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parquet expand glob pattern #925

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion rust/geoarrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ flatgeobuf_async = [
gdal = ["dep:gdal"]
geos = ["dep:geos"]
ipc_compression = ["arrow-ipc/lz4", "arrow-ipc/zstd"]
parquet = ["dep:parquet"]
parquet = ["dep:parquet", "dep:glob"]
parquet_async = [
"parquet",
"parquet/async",
Expand Down Expand Up @@ -69,6 +69,7 @@ geo-index = "0.1.1"
geo-traits = "0.2"
geos = { version = "9.0", features = ["v3_10_0", "geo"], optional = true }
geozero = { version = "0.14", features = ["with-wkb"] }
glob = { version = "0.3.1", optional = true }
half = { version = "2.4.1" }
http-range-client = { version = "0.8", optional = true }
indexmap = { version = "2" }
Expand Down
2 changes: 1 addition & 1 deletion rust/geoarrow/src/io/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ mod test;
mod writer;

pub use reader::{
GeoParquetDatasetMetadata, GeoParquetReaderMetadata, GeoParquetReaderOptions,
expand_glob, GeoParquetDatasetMetadata, GeoParquetReaderMetadata, GeoParquetReaderOptions,
GeoParquetRecordBatchReader, GeoParquetRecordBatchReaderBuilder,
};
#[cfg(feature = "parquet_async")]
Expand Down
112 changes: 112 additions & 0 deletions rust/geoarrow/src/io/parquet/reader/glob.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use std::sync::Arc;

use futures::StreamExt;
use glob::{self, Pattern};
use object_store::{ObjectMeta, ObjectStore};

use crate::error::Result;

/// Find all files within an object store with the specified pattern and suffix
pub async fn expand_glob(
store: Arc<dyn ObjectStore>,
pattern: &Pattern,
suffix: Option<&str>,
) -> Result<Vec<ObjectMeta>> {
let mut out = Vec::new();

// There are glob characters in the pattern
if let Some(first_glob_char_idx) = pattern.as_str().find(['?', '*', '[']) {
dbg!("glob branch");

// Strip off the chars before the glob char
// If there's a / char before the glob char, we use that as the prefix for listing in the
// object store
if let Some((prefix, _suffix)) = pattern.as_str()[..first_glob_char_idx].rsplit_once('/') {
while let Some(item) = store.list(Some(&prefix.into())).next().await {
let item = item?;
if item_matches(&item, pattern, suffix) {
out.push(item);
}
}

return Ok(out);
} else {
dbg!("branch 2");
while let Some(item) = store.list(None).next().await {
dbg!("item");
dbg!(&item);
let item = item?;
if item_matches(&item, pattern, suffix) {
out.push(item);
}
}

return Ok(out);
}
} else {
// Otherwise, list without a prefix
while let Some(item) = store.list(None).next().await {
let item = item?;
if item_matches(&item, pattern, suffix) {
out.push(item);
}
}
}

Ok(out)
}

fn item_matches(item: &ObjectMeta, pattern: &Pattern, suffix: Option<&str>) -> bool {
if pattern.matches(item.location.as_ref()) {
if let Some(suffix) = suffix {
if item.location.as_ref().ends_with(suffix) {
return true;
}
} else {
return true;
}
};

false
}

#[cfg(test)]
mod test {
use std::sync::Arc;

use super::*;
use futures::TryStreamExt;
use object_store::memory::InMemory;
use object_store::PutPayload;

#[tokio::test]
async fn test_matches_path() {
dbg!("hello world");

let store = Arc::new(InMemory::new());
store
.put(&"file1.txt".into(), PutPayload::new())
.await
.unwrap();
store
.put(&"file2.txt".into(), PutPayload::new())
.await
.unwrap();
store
.put(&"file3.txt".into(), PutPayload::new())
.await
.unwrap();

dbg!("done put");

let list = store.as_ref().list(None);
let x = list.try_collect::<Vec<_>>().await.unwrap();
dbg!(x);

let pattern = Pattern::new("file*.txt").unwrap();
let result = expand_glob(store, &pattern, Some(".txt")).await.unwrap();

// result.iter().map(||)
dbg!(result);
}
}
2 changes: 2 additions & 0 deletions rust/geoarrow/src/io/parquet/reader/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#[cfg(feature = "parquet_async")]
mod r#async;
mod builder;
mod glob;
mod metadata;
mod options;
mod parse;
mod spatial_filter;

pub use builder::{GeoParquetRecordBatchReader, GeoParquetRecordBatchReaderBuilder};
pub use glob::expand_glob;
pub use metadata::{GeoParquetDatasetMetadata, GeoParquetReaderMetadata};
pub use options::GeoParquetReaderOptions;
#[cfg(feature = "parquet_async")]
Expand Down
Loading