From 47c0f63af470936c19ec397ecb2bb98eb002a7fb Mon Sep 17 00:00:00 2001 From: Cyril Matthey-Doret Date: Wed, 25 Sep 2024 15:41:31 +0000 Subject: [PATCH] feat: support additional rdf formats (#18) * feat(io): support common rdf formats * chore(deps): add oxrdfio * chore: bump to 0.2.2 * chore: bump deps --- Cargo.lock | 49 +++++++++++++++++++++++++++++++++++++++------- fuzon/Cargo.toml | 3 ++- fuzon/src/lib.rs | 27 ++++++++++++++++--------- pyfuzon/Cargo.toml | 4 ++-- 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7f8a494..f2bc17e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -463,13 +463,14 @@ dependencies = [ [[package]] name = "fuzon" -version = "0.2.0" +version = "0.2.2" dependencies = [ "anyhow", "clap 4.5.17", "crossterm", "lazy_static", "oxrdf 0.1.7", + "oxrdfio", "oxttl", "ratatui", "reqwest", @@ -928,9 +929,9 @@ dependencies = [ [[package]] name = "oxrdf" -version = "0.2.0-rc.2" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1606c7721d7bae945e9b90f3d7a9cfb9e0e7e3e2ec6ff5d53c290fb8336b187" +checksum = "4c23ac4556485d4d4c31f3448a7ccc53d2444617a5e46ccc39f2f5a74e28091e" dependencies = [ "oxilangtag", "oxiri", @@ -938,16 +939,41 @@ dependencies = [ "thiserror", ] +[[package]] +name = "oxrdfio" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4706c6af55788842d2db36cff725001f4d3b84882bcccb6c19d798542742cf6" +dependencies = [ + "oxrdf 0.2.0", + "oxrdfxml", + "oxttl", + "thiserror", +] + +[[package]] +name = "oxrdfxml" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb88436b3c4375d87d320387f9b6fa84a772ea927ad0a9458fc35780605cda47" +dependencies = [ + "oxilangtag", + "oxiri", + "oxrdf 0.2.0", + "quick-xml", + "thiserror", +] + [[package]] name = "oxttl" -version = "0.1.0-rc.2" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7081a82ab0371b3ab5d9411531fe1de5b594a7fcf36f3b70f3f70ce417c9e11f" +checksum = "9795d5819708ef1669906eb96783a1764bf4b283f7615d46f66a798e8101fd3b" dependencies = [ "memchr", "oxilangtag", "oxiri", - "oxrdf 0.2.0-rc.2", + "oxrdf 0.2.0", "thiserror", ] @@ -1050,7 +1076,7 @@ dependencies = [ [[package]] name = "pyfuzon" -version = "0.2.0" +version = "0.2.2" dependencies = [ "anyhow", "clap 4.5.17", @@ -1127,6 +1153,15 @@ dependencies = [ "syn", ] +[[package]] +name = "quick-xml" +version = "0.36.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" diff --git a/fuzon/Cargo.toml b/fuzon/Cargo.toml index a41d890..eda2106 100644 --- a/fuzon/Cargo.toml +++ b/fuzon/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fuzon" -version = "0.2.1" +version = "0.2.2" edition = "2021" [lib] @@ -12,6 +12,7 @@ clap = { version = "4.5.16", features = ["derive"] } crossterm = "0.28.1" lazy_static = "1.5.0" oxrdf = "0.1.7" +oxrdfio = "0.1.0" oxttl = "0.1.0-rc.1" ratatui = "0.28.1" reqwest = { version = "0.12.0", features = ["blocking", "native-tls-vendored"] } diff --git a/fuzon/src/lib.rs b/fuzon/src/lib.rs index a079418..e58a23c 100644 --- a/fuzon/src/lib.rs +++ b/fuzon/src/lib.rs @@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader}; use anyhow::Result; use lazy_static::lazy_static; -use oxttl::TurtleParser; +use oxrdfio::{RdfFormat, RdfParser}; use reqwest::blocking::Client; use reqwest::Url; @@ -54,7 +54,7 @@ impl TermMatcher { .map(|t| t.0) .collect() } - pub fn from_readers(readers: Vec) -> Self { + pub fn from_readers(readers: Vec<(impl BufRead, RdfFormat)>) -> Self { let terms = gather_terms(readers).collect(); TermMatcher { terms } } @@ -78,18 +78,26 @@ impl fmt::Display for Term { } } -pub fn get_source(path: &str) -> Result> { +/// Get an rdf reader along with its format from a path +pub fn get_source(path: &str) 
-> Result<(Box, RdfFormat)> { + let file_ext = path.split('.').last().unwrap(); + let ext = match file_ext { + "owl" => "xml", + "rdf" => "xml", + _ => file_ext, + }; + let format = RdfFormat::from_extension(ext).expect("Unkown file extension"); if let Ok(url) = Url::parse(path) { // Handle URL let client = Client::new(); let response = client.get(url).send()?.error_for_status()?; let reader = BufReader::new(response); - Ok(Box::new(reader)) // Return boxed reader for URL + Ok((Box::new(reader), format)) // Return boxed reader for URL } else { // Handle file path let file = File::open(path)?; let reader = BufReader::new(file); - Ok(Box::new(reader)) // Return boxed reader for file + Ok((Box::new(reader), format)) // Return boxed reader for file } } /// Returns the input term vector sorted by match score (best first), @@ -111,12 +119,13 @@ pub fn rank_terms<'a>(query: &str, terms: Vec<&'a Term>) -> Vec<(&'a Term, f64)> return ranked; } -// Load URI-label pairs from all source. -pub fn gather_terms(readers: Vec) -> impl Iterator { + +// Load URI-label pairs from all sources. 
+pub fn gather_terms(readers: Vec<(impl BufRead, RdfFormat)>) -> impl Iterator { // NOTE: May want to use bulk loader for better performances let mut terms = Vec::new(); - for reader in readers { - let parser = TurtleParser::new().for_reader(reader); + for (reader, format) in readers { + let parser = RdfParser::from_format(format).for_reader(reader); let mut out = parser .map(|t| t.expect("Error parsing RDF")) .filter(|t| ANNOTATIONS.contains(t.predicate.as_str())) diff --git a/pyfuzon/Cargo.toml b/pyfuzon/Cargo.toml index 620ded0..a4ee49b 100644 --- a/pyfuzon/Cargo.toml +++ b/pyfuzon/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pyfuzon" -version = "0.2.1" +version = "0.2.2" edition = "2021" [lib] @@ -11,7 +11,7 @@ crate-type = ["cdylib"] anyhow = "1.0.86" clap = { version = "4.5.16", features = ["derive"] } crossterm = "0.28.1" -fuzon = { version = "0.2.0", path = "../fuzon" } +fuzon = { version = "0.2.2", path = "../fuzon" } lazy_static = "1.5.0" oxrdf = "0.1.7" oxttl = "0.1.0-rc.1"