diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 00b40e31..fbcde5ab 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: runs-on: windows-latest steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.79.0 + - uses: dtolnay/rust-toolchain@1.83.0 - name: run tests run: | @@ -34,7 +34,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.79.0 + - uses: dtolnay/rust-toolchain@1.83.0 - name: Install toml-lint run: | @@ -49,7 +49,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.79.0 + - uses: dtolnay/rust-toolchain@1.83.0 - name: Install cargo-machete run: | @@ -64,7 +64,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.79.0 + - uses: dtolnay/rust-toolchain@1.83.0 - name: Login to GitHub Container Registry uses: docker/login-action@v1 diff --git a/Cargo.lock b/Cargo.lock index ee679b19..e4120314 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -269,7 +269,7 @@ dependencies = [ "chrono", "comfy-table", "half", - "lexical-core", + "lexical-core 0.8.5", "num", "ryu", ] @@ -289,7 +289,7 @@ dependencies = [ "csv", "csv-core", "lazy_static", - "lexical-core", + "lexical-core 0.8.5", "regex", ] @@ -334,7 +334,7 @@ dependencies = [ "chrono", "half", "indexmap 2.4.0", - "lexical-core", + "lexical-core 0.8.5", "num", "serde", "serde_json", @@ -1187,9 +1187,9 @@ dependencies = [ [[package]] name = "bit-vec" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" @@ -1257,11 +1257,12 @@ dependencies = [ [[package]] name = "bstr" -version = "1.10.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" +checksum = "786a307d683a5bf92e6fd5fd69a7eb613751668d1d8d67d802846dfe367c62c8" dependencies = [ "memchr", + "regex-automata 0.4.7", "serde", ] @@ -2522,6 +2523,7 @@ name = "exon-bed" version = "0.32.2" dependencies = [ "arrow", + "bstr", "exon-common", "futures", "noodles", @@ -3465,11 +3467,24 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", + "lexical-parse-float 0.8.5", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "lexical-write-float 0.8.5", + "lexical-write-integer 0.8.5", +] + +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +dependencies = [ + "lexical-parse-float 1.0.5", + "lexical-parse-integer 1.0.5", + "lexical-util 1.0.6", + "lexical-write-float 1.0.5", + "lexical-write-integer 1.0.5", ] [[package]] @@ -3478,8 +3493,19 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ - "lexical-parse-integer", - "lexical-util", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "static_assertions", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +dependencies = [ + "lexical-parse-integer 1.0.5", + "lexical-util 1.0.6", "static_assertions", ] @@ -3489,7 +3515,17 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +dependencies = [ + "lexical-util 1.0.6", "static_assertions", ] @@ -3502,14 +3538,34 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "lexical-util" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" +dependencies = [ + "static_assertions", +] + [[package]] name = "lexical-write-float" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "lexical-util", - "lexical-write-integer", + "lexical-util 0.8.5", + "lexical-write-integer 0.8.5", + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +dependencies = [ + "lexical-util 1.0.6", + "lexical-write-integer 1.0.5", "static_assertions", ] @@ -3519,7 +3575,17 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +dependencies = [ + "lexical-util 1.0.6", "static_assertions", ] @@ -3765,9 +3831,9 @@ dependencies = [ [[package]] name = "noodles" -version = "0.79.0" +version = "0.86.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a67f576cd91c3d6fbc0d47b6817abb511519276380005788d8f69852cbfd621" +checksum = "9b6b7e0baaeaa2d3fc0af0271a711f9e1a1f32171ae2a921d292ac18dab8d412" dependencies = [ "noodles-bam", "noodles-bcf", @@ -3787,15 +3853,15 @@ dependencies = [ [[package]] name = "noodles-bam" -version = "0.66.0" +version = "0.71.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69ab4109074a07e6066cc0ba46db7e1c2ffc8f3e12d1c7aeb9811e6ff37e67c6" +checksum = "98dce2342ffe5d96e3d56d4f24dee62e4025a3277f2f5cc755bf7ee6a615e286" dependencies = [ "bstr", "byteorder", - "bytes", "futures", "indexmap 2.4.0", + "memchr", "noodles-bgzf", "noodles-core", "noodles-csi", @@ -3805,9 +3871,9 @@ dependencies = [ [[package]] name = "noodles-bcf" -version = "0.59.0" +version = "0.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db5300a46f7409e08d95d2aa151eaca151bc6c929ab8aae1939ba6634c734af8" +checksum = "7e0ab64aa6497c437aafa89663c4c4476afa0117495330a62847617ce0ce29fa" dependencies = [ "byteorder", "futures", @@ -3821,18 +3887,21 @@ dependencies = [ [[package]] name = "noodles-bed" -version = "0.15.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c781b824d10b621df17d26fb5cd9d3cc8188aeabda7de7e36cde51f6015b31d" +checksum = "13396d1aa63855efe5289b3711a191c007f08a89366ebe85acab2ae07883f8f7" dependencies = [ + "bstr", + "lexical-core 1.0.5", + "memchr", "noodles-core", ] [[package]] name = "noodles-bgzf" -version = "0.32.0" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2fba0f4a64cc897d9396d730a0c444d148daed7de31ad5904ecc673178fc9d" +checksum = "3e624384981e5847bfd6a026f157c45d687187c30ee21b8c435310267c7aa7ab" dependencies = [ "byteorder", "bytes", @@ -3855,9 +3924,9 @@ dependencies = [ [[package]] name = "noodles-cram" -version = "0.67.0" +version = "0.72.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "832bb907f2fa5696add81c53eb97493bb6ba6015de4334f7a707422ca97b0e25" +checksum = "ad4b91620a740d26de912091445a063f8c2bcb8295b654e071962f6cba17c951" dependencies = [ "async-compression", "bitflags 2.6.0", @@ -3880,11 +3949,12 @@ dependencies = [ [[package]] name = "noodles-csi" -version = "0.37.0" +version = "0.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4bc8001c54f1d8e47e1ac6041a5f27edc99b68bacea3fade9c89059de285aea" +checksum = "199113fe53fef2d79b0a9f670d1cad524b4ddcefdc1629dc69f0eb2707212c9e" dependencies = [ "bit-vec", + "bstr", "byteorder", "indexmap 2.4.0", "noodles-bgzf", @@ -3894,9 +3964,9 @@ dependencies = [ [[package]] name = "noodles-fasta" -version = "0.42.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d60db7c4c514211598f2d7eb38e499e3b42d3eb690779fd0b36f224650c75c82" +checksum = "16862f9e1bf1ad825a1fab6fc29da9e950dd477cfcd0cb1a2b14fa8ee1a72575" dependencies = [ "bstr", "bytes", @@ -3908,9 +3978,9 @@ dependencies = [ [[package]] name = "noodles-fastq" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c596792c857f37e6a85e2cf1e68578f5b70f867cad028bf95c1e2b5d7c9c84eb" +checksum = "1606247d99eae65370cdb0ef5590f109a5286d57c06da8e738466cf95a4509d5" dependencies = [ "bstr", "futures", @@ -3920,9 +3990,9 @@ dependencies = [ [[package]] name = "noodles-gff" -version = "0.35.0" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adef59012090b5694b58cad0e4426cd18af404803f942d02e664af607d89ee28" +checksum = "108dfa5c377374ab61fd060ab9c08a99f118659218e83d5f71125b553d9b0d2b" dependencies = [ "futures", "indexmap 2.4.0", @@ -3935,9 +4005,9 @@ dependencies = [ [[package]] name = "noodles-gtf" -version = "0.30.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fd29a4d1bb243412ebd5090d0908c0c797ad3dd7832cc1623175c9c7c9d1a5a" +checksum = "fe48bdf59c757e63369f0fd14a90c467352d97a6c6ddf749ace30fa5f61b4602" dependencies = [ "noodles-bgzf", "noodles-core", @@ -3946,29 +4016,29 @@ dependencies = [ [[package]] name = "noodles-sam" -version = "0.63.0" +version = "0.67.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460564b59da27a291616ea9f515e8eda8624a5d64b214423fcf1f1819b05cd78" +checksum = "676b1113ce4e25abbbd369ffab9891f80818f5cce89f1ffe700be839ff1aa0bf" dependencies = [ "bitflags 2.6.0", "bstr", "futures", "indexmap 2.4.0", - "lexical-core", + "lexical-core 1.0.5", "memchr", "noodles-bgzf", "noodles-core", "noodles-csi", + "pin-project-lite", "tokio", ] [[package]] name = "noodles-tabix" -version = "0.43.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "545e16e229b7f8734b0a2a36bd4c98a5b70128663b16b5201ddadc0d09c28d4a" +checksum = "fde991a31c6203845117944c1d5f697b69c382e37eb2d70f3e3f2b575fbca62d" dependencies = [ - "bit-vec", "byteorder", "indexmap 2.4.0", "noodles-bgzf", @@ -3979,9 +4049,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.62.0" +version = "0.69.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f64c43315f757fe42ae014cf83996698cc9e47388080db165d0eb7b5f74092" +checksum = "70c72cbcfdfb14f0f76980dbe8f573f1bb7f9fa95eaa4780ee99aaa090c6d421" dependencies = [ "futures", "indexmap 2.4.0", diff --git a/Cargo.toml b/Cargo.toml index 334aec5a..7a8ff9e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ arrow = { version = "52.2.0" } async-trait = "0.1.82" datafusion = { version = "41", features = ["compression", "parquet"] } futures = "0.3" -noodles = { version = "0.79" } +noodles = { version = "0.86" } object_store = { version = "0.10.2" } tokio = { version = "1", features = ["io-util"] } tokio-util = { version = "0.7.11", features = ["compat"] } diff --git a/Dockerfile b/Dockerfile index a2213e81..0fc53be8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.80 as builder +FROM rust:1.83 as builder COPY . /usr/src/exon WORKDIR /usr/src/exon diff --git a/exon/exon-bed/Cargo.toml b/exon/exon-bed/Cargo.toml index e5298f27..9e37fe56 100644 --- a/exon/exon-bed/Cargo.toml +++ b/exon/exon-bed/Cargo.toml @@ -12,6 +12,7 @@ version.workspace = true [dependencies] arrow = { workspace = true } +bstr = "1.11.1" exon-common = { path = "../exon-common", version = "0.32.2" } futures = { workspace = true } noodles = { workspace = true, features = ["bed", "core"] } diff --git a/exon/exon-bed/src/batch_reader.rs b/exon/exon-bed/src/batch_reader.rs index 17581042..a73abf4d 100644 --- a/exon/exon-bed/src/batch_reader.rs +++ b/exon/exon-bed/src/batch_reader.rs @@ -18,27 +18,14 @@ use arrow::{error::ArrowError, record_batch::RecordBatch}; use exon_common::ExonArrayBuilder; use futures::Stream; -use noodles::bed::Record; +use noodles::{ + bed::feature::{record::Strand, record_buf::OtherFields, RecordBuf}, + core::Position, +}; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; use super::{array_builder::BEDArrayBuilder, bed_record_builder::BEDRecord, config::BEDConfig}; -macro_rules! extract_record { - ($buf:expr, $num:expr) => {{ - let r: Record<$num> = match Record::from_str(&$buf) { - Ok(r) => r, - Err(e) => { - return Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("invalid record: {e}"), - )); - } - }; - - BEDRecord::from(r) - }}; -} - /// A batch reader for BED files. pub struct BatchReader { /// The underlying BED reader. @@ -116,9 +103,6 @@ where } } - // Get the number of tab separated fields - let num_fields = buf.split('\t').count(); - // Remove the newline buf.pop(); @@ -128,15 +112,111 @@ where buf.pop(); } + // Get the number of tab separated fields + let split = buf.split('\t').collect::>(); + let num_fields = split.len(); + let bed_record = match num_fields { - 12 => extract_record!(buf, 12), - 9 => extract_record!(buf, 9), - 8 => extract_record!(buf, 8), - 7 => extract_record!(buf, 7), - 6 => extract_record!(buf, 6), - 5 => extract_record!(buf, 5), - 4 => extract_record!(buf, 4), - 3 => extract_record!(buf, 3), + 12 => { + let buf_builder = RecordBuf::<6>::builder(); + + let other_fields = OtherFields::default(); + + let mut record = buf_builder + .set_reference_sequence_name(split[0].as_bytes().to_vec()) + .set_feature_start(Position::from_str(split[1]).unwrap()) + .set_feature_end(Position::from_str(split[2]).unwrap()) + .set_name(split[3].as_bytes().to_vec()) + .set_score(split[4].parse().unwrap()); + + match split[5] { + "+" => { + record = record.set_strand(Strand::Forward); + } + "-" => { + record = record.set_strand(Strand::Reverse); + } + "." => {} + _ => { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid strand: {}", split[5]), + )) + } + }; + + let record = record.set_other_fields(other_fields).build(); + + BEDRecord::from(record) + } + // 9 => extract_record!(buf, 9), + // 8 => extract_record!(buf, 8), + // 7 => extract_record!(buf, 7), + 6 => { + // let record = read_record_6(buf.as_bytes())?; + let mut buf_builder = RecordBuf::<6>::builder() + .set_reference_sequence_name(split[0].as_bytes().to_vec()) + .set_feature_start(Position::from_str(split[1]).unwrap()) + .set_feature_end(Position::from_str(split[2]).unwrap()) + .set_name(split[3].as_bytes().to_vec()) + .set_score(split[4].parse().unwrap()); + + match split[5] { + "+" => { + buf_builder = buf_builder.set_strand(Strand::Forward); + } + "-" => { + buf_builder = buf_builder.set_strand(Strand::Reverse); + } + "." => {} + _ => { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid strand: {}", split[5]), + )) + } + }; + + let record = buf_builder.build(); + + BEDRecord::from(record) + } + 5 => { + let buf_builder = RecordBuf::<5>::builder(); + + let record = buf_builder + .set_reference_sequence_name(split[0].as_bytes().to_vec()) + .set_feature_start(Position::from_str(split[1]).unwrap()) + .set_feature_end(Position::from_str(split[2]).unwrap()) + .set_name(split[3].as_bytes().to_vec()) + .set_score(split[4].parse().unwrap()) + .build(); + + BEDRecord::from(record) + } + 4 => { + let buf_builder = RecordBuf::<4>::builder(); + + let record = buf_builder + .set_reference_sequence_name(split[0].as_bytes().to_vec()) + .set_feature_start(Position::from_str(split[1]).unwrap()) + .set_feature_end(Position::from_str(split[2]).unwrap()) + .set_name(split[3].as_bytes().to_vec()) + .build(); + + BEDRecord::from(record) + } + 3 => { + let buf_builder = RecordBuf::<3>::builder(); + + let record = buf_builder + .set_reference_sequence_name(split[0].as_bytes().to_vec()) + .set_feature_start(Position::from_str(split[1]).unwrap()) + .set_feature_end(Position::from_str(split[2]).unwrap()) + .build(); + + BEDRecord::from(record) + } _ => { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, diff --git a/exon/exon-bed/src/bed_record_builder.rs b/exon/exon-bed/src/bed_record_builder.rs index 2a69be36..36ae3753 100644 --- a/exon/exon-bed/src/bed_record_builder.rs +++ b/exon/exon-bed/src/bed_record_builder.rs @@ -13,13 +13,12 @@ // limitations under the License. use noodles::{ - bed::{ - record::{Color, Name, Score, Strand}, - Record, - }, + bed::feature::{record::Strand, RecordBuf}, core::Position, }; +use bstr::BStr; + pub struct BEDRecord { reference_sequence_name: String, start: u64, @@ -150,176 +149,101 @@ impl BEDRecordBuilder { self } - pub fn name(mut self, name: Option<&Name>) -> Self { + pub fn name(mut self, name: Option<&BStr>) -> Self { self.name = name.map(|n| n.to_string()); self } - pub fn score(mut self, score: Option) -> Self { + pub fn score(mut self, score: Option) -> Self { self.score = score.map(|i| u16::from(i) as i64); self } pub fn strand(mut self, strand: Option) -> Self { - self.strand = strand.map(|s| s.to_string()); - self - } - - pub fn thick_start(mut self, thick_start: Position) -> Self { - self.thick_start = Some(thick_start.get() as u64); - self - } - - pub fn thick_end(mut self, thick_end: Position) -> Self { - self.thick_end = Some(thick_end.get() as u64); - self - } - - pub fn color(mut self, color: Option) -> Self { - self.color = color.map(|c| c.to_string()); - self - } + self.strand = match strand { + Some(Strand::Forward) => Some("+".to_string()), + Some(Strand::Reverse) => Some("-".to_string()), + None => None, + }; - pub fn block_count(mut self, block_count: Option) -> Self { - self.block_count = block_count; self } - pub fn block_sizes(mut self, block_sizes: Option) -> Self { - self.block_sizes = block_sizes; - self - } - - pub fn block_starts(mut self, block_starts: Option) -> Self { - self.block_starts = block_starts; - self - } -} + // pub fn thick_start(mut self, thick_start: Position) -> Self { + // self.thick_start = Some(thick_start.get() as u64); + // self + // } -impl From> for BEDRecord { - fn from(value: Record<12>) -> Self { - let mut block_starts = Vec::new(); - let mut block_sizes = Vec::new(); + // pub fn thick_end(mut self, thick_end: Position) -> Self { + // self.thick_end = Some(thick_end.get() as u64); + // self + // } - value.blocks().iter().for_each(|(start, size)| { - block_starts.push(start.to_string()); - block_sizes.push(size.to_string()); - }); + // pub fn color(mut self, color: Option<&BStr>) -> Self { + // self.color = color.map(|c| c.to_string()); + // self + // } - let block_start_csv = block_starts.join(","); - let block_size_csv = block_sizes.join(","); + // pub fn block_count(mut self, block_count: Option) -> Self { + // self.block_count = block_count; + // self + // } - let builder = BEDRecordBuilder::new() - .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) - .name(value.name()) - .score(value.score()) - .strand(value.strand()) - .thick_start(value.thick_start()) - .thick_end(value.thick_end()) - .color(value.color()) - .block_count(Some(block_starts.len() as u64)) - .block_sizes(Some(block_size_csv)) - .block_starts(Some(block_start_csv)); + // pub fn block_sizes(mut self, block_sizes: Option) -> Self { + // self.block_sizes = block_sizes; + // self + // } - builder.finish() - } -} - -impl From> for BEDRecord { - fn from(value: Record<9>) -> Self { - let builder = BEDRecordBuilder::new() - .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) - .name(value.name()) - .score(value.score()) - .strand(value.strand()) - .thick_start(value.thick_start()) - .thick_end(value.thick_end()) - .color(value.color()); - - builder.finish() - } -} - -impl From> for BEDRecord { - fn from(value: Record<8>) -> Self { - let builder = BEDRecordBuilder::new() - .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) - .name(value.name()) - .score(value.score()) - .strand(value.strand()) - .thick_start(value.thick_start()) - .thick_end(value.thick_end()); - - builder.finish() - } -} - -impl From> for BEDRecord { - fn from(value: Record<7>) -> Self { - let builder = BEDRecordBuilder::new() - .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) - .name(value.name()) - .score(value.score()) - .strand(value.strand()) - .thick_start(value.thick_start()); - - builder.finish() - } + // pub fn block_starts(mut self, block_starts: Option) -> Self { + // self.block_starts = block_starts; + // self + // } } -impl From> for BEDRecord { - fn from(value: Record<6>) -> Self { +impl From> for BEDRecord { + fn from(value: RecordBuf<6>) -> Self { let builder = BEDRecordBuilder::new() .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) + .start(value.feature_start()) + .end(value.feature_end().unwrap()) .name(value.name()) - .score(value.score()) + .score(Some(value.score())) .strand(value.strand()); builder.finish() } } -impl From> for BEDRecord { - fn from(value: Record<5>) -> Self { +impl From> for BEDRecord { + fn from(value: RecordBuf<5>) -> Self { let builder = BEDRecordBuilder::new() .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) + .start(value.feature_start()) + .end(value.feature_end().unwrap()) .name(value.name()) - .score(value.score()); + .score(Some(value.score())); builder.finish() } } -impl From> for BEDRecord { - fn from(value: Record<4>) -> Self { +impl From> for BEDRecord { + fn from(value: RecordBuf<4>) -> Self { let builder = BEDRecordBuilder::new() .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()) - .name(value.name()); + .start(value.feature_start()) + .end(value.feature_end().unwrap()); builder.finish() } } -impl From> for BEDRecord { - fn from(value: Record<3>) -> Self { +impl From> for BEDRecord { + fn from(value: RecordBuf<3>) -> Self { let builder = BEDRecordBuilder::new() .reference_sequence_name(value.reference_sequence_name().to_string()) - .start(value.start_position()) - .end(value.end_position()); + .start(value.feature_start()) + .end(value.feature_end().unwrap()); builder.finish() } diff --git a/exon/exon-core/src/datasources/indexed_file/indexed_bgzf_file.rs b/exon/exon-core/src/datasources/indexed_file/indexed_bgzf_file.rs index 8d4fbe05..c61e13a5 100644 --- a/exon/exon-core/src/datasources/indexed_file/indexed_bgzf_file.rs +++ b/exon/exon-core/src/datasources/indexed_file/indexed_bgzf_file.rs @@ -73,12 +73,7 @@ pub async fn get_byte_range_for_file( std::io::Error::new(std::io::ErrorKind::InvalidInput, "missing tabix header") })?; - let region_name = std::str::from_utf8(region.name()).map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!("invalid region name: {}", e), - ) - })?; + let region_name = region.name(); let id = header.reference_sequence_names().get_index_of(region_name); match id { diff --git a/exon/exon-core/src/session_context/exon_context_ext.rs b/exon/exon-core/src/session_context/exon_context_ext.rs index 97d4ea51..83cd8764 100644 --- a/exon/exon-core/src/session_context/exon_context_ext.rs +++ b/exon/exon-core/src/session_context/exon_context_ext.rs @@ -723,7 +723,6 @@ impl ExonSession { #[cfg(test)] mod tests { use datafusion::datasource::file_format::file_compression_type::FileCompressionType; - use exon_test::test_listing_table_dir; use crate::{ datasources::{ diff --git a/exon/exon-core/tests/sqllogictests/slt/bed-select-tests.slt b/exon/exon-core/tests/sqllogictests/slt/bed-select-tests.slt index b58c190c..4fa2d5de 100644 --- a/exon/exon-core/tests/sqllogictests/slt/bed-select-tests.slt +++ b/exon/exon-core/tests/sqllogictests/slt/bed-select-tests.slt @@ -6,7 +6,7 @@ CREATE EXTERNAL TABLE bed STORED AS BED LOCATION '$CARGO_MANIFEST_DIR/test-data/ query T SELECT * FROM bed LIMIT 1 ---- -chr1 11874 12227 NR_046018_exon_0_0_chr1_11874_f NULL + NULL NULL NULL NULL NULL NULL +chr1 11873 12227 NR_046018_exon_0_0_chr1_11874_f 0 + NULL NULL NULL NULL NULL NULL query T SELECT COUNT(*) as cnt FROM bed; @@ -22,7 +22,7 @@ CREATE EXTERNAL TABLE bed STORED AS BED PARTITIONED BY (sample) LOCATION '$CARGO query T SELECT * FROM bed WHERE sample = '1' LIMIT 1; ---- -chr1 11874 12227 NR_046018_exon_0_0_chr1_11874_f NULL + NULL NULL NULL NULL NULL NULL 1 +chr1 11873 12227 NR_046018_exon_0_0_chr1_11874_f 0 + NULL NULL NULL NULL NULL NULL 1 statement ok DROP TABLE bed; diff --git a/exon/exon-gff/src/array_builder.rs b/exon/exon-gff/src/array_builder.rs index d45b93c0..7076e5d2 100644 --- a/exon/exon-gff/src/array_builder.rs +++ b/exon/exon-gff/src/array_builder.rs @@ -23,7 +23,7 @@ use arrow::{ error::ArrowError, }; use exon_common::ExonArrayBuilder; -use noodles::gff::lazy::Record; +use noodles::gff::Record; pub struct GFFArrayBuilder { seqnames: GenericStringBuilder, @@ -96,13 +96,12 @@ impl GFFArrayBuilder { 5 => { let score = record.score(); - if score.is_empty() || score == "." { - self.scores.append_null(); - } else { - let score_f32 = score - .parse::() - .map_err(|e| ArrowError::ExternalError(Box::new(e)))?; - self.scores.append_value(score_f32); + match score { + Some(Ok(score)) => { + self.scores.append_value(score); + } + Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))), + None => self.scores.append_null(), } } 6 => { @@ -117,10 +116,12 @@ impl GFFArrayBuilder { 7 => { let phase = record.phase(); - if phase.is_empty() || phase == "." { - self.phases.append_null(); - } else { - self.phases.append_value(phase); + match phase { + Some(Ok(phase)) => { + self.phases.append_value(phase); + } + Some(Err(e)) => return Err(ArrowError::ExternalError(Box::new(e))), + None => self.phases.append_null(), } } 8 => { @@ -130,15 +131,15 @@ impl GFFArrayBuilder { self.attributes.keys().append_value(key); match value { - noodles::gff::lazy::record::attributes::field::Value::String(value) => { + noodles::gff::record::attributes::field::Value::String(value) => { self.attributes.values().append(true); self.attributes.values().values().append_value(value); } - noodles::gff::lazy::record::attributes::field::Value::Array( - attr_values, - ) => { + noodles::gff::record::attributes::field::Value::Array(attr_values) => { let list_values = self.attributes.values().values(); for value in attr_values.iter() { + let value = value?; + list_values.append_value(value); } self.attributes.values().append(true); diff --git a/exon/exon-gff/src/batch_reader.rs b/exon/exon-gff/src/batch_reader.rs index 87c85d62..126db1f6 100644 --- a/exon/exon-gff/src/batch_reader.rs +++ b/exon/exon-gff/src/batch_reader.rs @@ -63,17 +63,17 @@ where }) } - async fn read_line(&mut self) -> Result> { - let mut line = noodles::gff::lazy::Line::default(); + async fn read_line(&mut self) -> Result> { + let mut line = noodles::gff::Line::default(); - match self.reader.read_lazy_line(&mut line).await { + match self.reader.read_line(&mut line).await { Ok(0) => Ok(None), Ok(_) => Ok(Some(line)), Err(e) => Err(e.into()), } } - fn filter(&self, record: &noodles::gff::lazy::Record) -> Result { + fn filter(&self, record: &noodles::gff::Record) -> Result { let chrom = record.reference_sequence_name(); match &self.region { @@ -105,21 +105,16 @@ where loop { match self.read_line().await? { None => break, - Some(line) => match line { - noodles::gff::lazy::Line::Comment(_) => {} - noodles::gff::lazy::Line::Directive(_) => {} - noodles::gff::lazy::Line::Record(record) => { - // Filter on region if provided. + Some(line) => match line.as_record() { + Some(Ok(record)) => { if !self.filter(&record)? { continue; } gff_array_builder.append(&record)?; - - if gff_array_builder.len() == self.config.batch_size { - break; - } } + Some(Err(e)) => return Err(e.into()), + None => {} }, } } diff --git a/exon/exon-gff/src/error.rs b/exon/exon-gff/src/error.rs index 3bbb8549..b81390ab 100644 --- a/exon/exon-gff/src/error.rs +++ b/exon/exon-gff/src/error.rs @@ -37,19 +37,6 @@ impl Display for ExonGFFError { impl Error for ExonGFFError {} -impl From for ExonGFFError { - fn from(e: noodles::gff::line::ParseError) -> Self { - match e { - noodles::gff::line::ParseError::InvalidRecord(s) => { - ExonGFFError::InvalidRecord(s.to_string()) - } - noodles::gff::line::ParseError::InvalidDirective(s) => { - ExonGFFError::InvalidDirective(s.to_string()) - } - } - } -} - impl From for ExonGFFError { fn from(e: ArrowError) -> Self { ExonGFFError::ExternalError(Box::new(e)) diff --git a/exon/exon-vcf/src/array_builder/lazy_array_builder.rs b/exon/exon-vcf/src/array_builder/lazy_array_builder.rs index 01fb4b05..91eafe9c 100644 --- a/exon/exon-vcf/src/array_builder/lazy_array_builder.rs +++ b/exon/exon-vcf/src/array_builder/lazy_array_builder.rs @@ -229,7 +229,7 @@ impl LazyVCFArrayBuilder { s.push('='); match value { - InfosValue::String(v) => s.push_str(v), + InfosValue::String(v) => s.push_str(&v), InfosValue::Character(v) => s.push(v), InfosValue::Float(v) => s.push_str(&v.to_string()), InfosValue::Flag => {