From a2eb1c05559ad4bb522ad9ef0081dbfa185e533d Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 24 May 2024 09:52:46 -0700 Subject: [PATCH] fix: better handling for ambiguous AAs --- .../sqllogictests/slt/fasta-scan-tests.slt | 11 ++- exon/exon-fasta/src/array_builder.rs | 98 ++++++++++++------- exon/exon-fasta/src/batch_reader.rs | 18 ++-- exon/exon-fasta/src/config.rs | 14 +++ exon/exon-fasta/src/error.rs | 6 +- 5 files changed, 95 insertions(+), 52 deletions(-) diff --git a/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt b/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt index 9eb489a3..41235779 100644 --- a/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt +++ b/exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt @@ -142,8 +142,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa') query T SELECT id, description, sequence FROM exon_table; ---- -a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] -b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] +a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23] +b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23] statement ok DROP TABLE exon_table; @@ -154,5 +154,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa', query T SELECT id, description, sequence FROM exon_table; ---- -a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] -b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] +a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23] +b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23] + +statement ok +DROP TABLE exon_table; diff --git a/exon/exon-fasta/src/array_builder.rs b/exon/exon-fasta/src/array_builder.rs index 94d61eb7..45d68b79 100644 --- a/exon/exon-fasta/src/array_builder.rs +++ b/exon/exon-fasta/src/array_builder.rs @@ -15,7 +15,7 @@ use std::{str::FromStr, sync::Arc}; use arrow::{ - array::{ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder}, + array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder}, datatypes::SchemaRef, error::ArrowError, }; @@ -29,6 +29,8 @@ pub struct FASTAArrayBuilder { descriptions: GenericStringBuilder, sequences: SequenceBuilder, projection: Vec, + append_name: bool, + append_description: bool, append_sequence: bool, rows: usize, } @@ -67,10 +69,16 @@ impl FASTAArrayBuilder { GenericStringBuilder::::with_capacity(capacity, capacity), ), SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein( - GenericListBuilder::::new(Int32Builder::with_capacity(capacity)), + GenericListBuilder::::with_capacity( + Int32Builder::with_capacity(60), + capacity, + ), ), SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA( - GenericListBuilder::::new(Int32Builder::with_capacity(capacity)), + GenericListBuilder::::with_capacity( + Int32Builder::with_capacity(60), + capacity, + ), ), }; @@ -79,7 +87,9 @@ impl FASTAArrayBuilder { None => (0..schema.fields().len()).collect(), }; - let append_sequence = true; + let append_name = projection.contains(&0); + let append_description = projection.contains(&1); + let append_sequence = projection.contains(&2); Ok(Self { names: GenericStringBuilder::::with_capacity(capacity, capacity), @@ -88,11 +98,13 @@ impl FASTAArrayBuilder { projection, rows: 0, append_sequence, + append_name, + append_description, }) } pub fn len(&self) -> usize { - self.names.len() + self.rows } pub fn is_empty(&self) -> bool { @@ -100,17 +112,23 @@ impl FASTAArrayBuilder { } pub fn append(&mut self, definition: &str, sequence: &[u8]) -> Result<(), ArrowError> { - let definition = - Definition::from_str(definition).map_err(|e| ArrowError::ExternalError(Box::new(e)))?; + if self.append_name || self.append_description { + let definition = Definition::from_str(definition) + .map_err(|e| ArrowError::ExternalError(Box::new(e)))?; - let name = std::str::from_utf8(definition.name())?; - self.names.append_value(name); + if self.append_name { + let name = std::str::from_utf8(definition.name())?; + self.names.append_value(name); + } - if let Some(description) = definition.description() { - let description = std::str::from_utf8(description)?; - self.descriptions.append_value(description); - } else { - self.descriptions.append_null(); + if self.append_description { + if let Some(description) = definition.description() { + let description = std::str::from_utf8(description)?; + self.descriptions.append_value(description); + } else { + self.descriptions.append_null(); + } + } } if self.append_sequence { @@ -129,25 +147,30 @@ impl FASTAArrayBuilder { for aa in sequence { let aa = match aa { b'A' => 1, - b'C' => 2, - b'D' => 3, - b'E' => 4, - b'F' => 5, - b'G' => 6, - b'H' => 7, - b'I' => 8, - b'K' => 9, - b'L' => 10, - b'M' => 11, - b'N' => 12, - b'P' => 13, - b'Q' => 14, - b'R' => 15, - b'S' => 16, - b'T' => 17, - b'V' => 18, - b'W' => 19, - b'Y' => 20, + b'B' => 2, + b'C' => 3, + b'D' => 4, + b'E' => 5, + b'F' => 6, + b'G' => 7, + b'H' => 8, + b'I' => 9, + b'K' => 10, + b'L' => 11, + b'M' => 12, + b'N' => 13, + b'O' => 14, + b'P' => 15, + b'Q' => 16, + b'R' => 17, + b'S' => 18, + b'T' => 19, + b'U' => 20, + b'V' => 21, + b'W' => 22, + b'Y' => 23, + b'X' => 24, + b'Z' => 25, _ => { return Err(ExonFastaError::InvalidAminoAcid(*aa).into()); } @@ -190,8 +213,13 @@ impl FASTAArrayBuilder { fn finish_inner(&mut self) -> Vec { let mut arrays: Vec = Vec::with_capacity(self.projection.len()); - arrays.push(Arc::new(self.names.finish())); - arrays.push(Arc::new(self.descriptions.finish())); + if self.append_name { + arrays.push(Arc::new(self.names.finish())); + } + + if self.append_description { + arrays.push(Arc::new(self.descriptions.finish())); + } if self.append_sequence { arrays.push(self.sequences.finish()); diff --git a/exon/exon-fasta/src/batch_reader.rs b/exon/exon-fasta/src/batch_reader.rs index e1d05103..51b489ab 100644 --- a/exon/exon-fasta/src/batch_reader.rs +++ b/exon/exon-fasta/src/batch_reader.rs @@ -70,7 +70,7 @@ where } async fn read_batch(&mut self) -> ExonFastaResult> { - let mut record_batch = FASTAArrayBuilder::create( + let mut array_builder = FASTAArrayBuilder::create( self.config.file_schema.clone(), self.config.projection.clone(), self.config.batch_size, @@ -82,26 +82,20 @@ where self.sequence_buffer.clear(); if (self.read_record().await?).is_some() { - record_batch.append(&self.buf, &self.sequence_buffer)?; + array_builder.append(&self.buf, &self.sequence_buffer)?; } else { break; } } - if record_batch.is_empty() { + if array_builder.is_empty() { return Ok(None); } - let batch = RecordBatch::try_new(self.config.file_schema.clone(), record_batch.finish())?; + let schema = self.config.projected_schema()?; + let batch = array_builder.try_into_record_batch(schema)?; - match &self.config.projection { - Some(projection) => { - let projected_batch = batch.project(projection)?; - - Ok(Some(projected_batch)) - } - None => Ok(Some(batch)), - } + Ok(Some(batch)) } /// Converts the reader into a stream of batches. diff --git a/exon/exon-fasta/src/config.rs b/exon/exon-fasta/src/config.rs index 26cfb698..36ca6f31 100644 --- a/exon/exon-fasta/src/config.rs +++ b/exon/exon-fasta/src/config.rs @@ -102,6 +102,20 @@ impl FASTAConfig { self } + /// Get the projection, returning the identity projection if none is set. + pub fn projection(&self) -> Vec { + self.projection + .clone() + .unwrap_or_else(|| (0..self.file_schema.fields().len()).collect()) + } + + /// Get the projected schema. + pub fn projected_schema(&self) -> arrow::error::Result { + let schema = self.file_schema.project(&self.projection())?; + + Ok(Arc::new(schema)) + } + /// Create a new FASTA configuration with a given projection. pub fn with_projection(mut self, projection: Vec) -> Self { // Only include fields that are in the file schema. diff --git a/exon/exon-fasta/src/error.rs b/exon/exon-fasta/src/error.rs index 6f51f24a..41ef334a 100644 --- a/exon/exon-fasta/src/error.rs +++ b/exon/exon-fasta/src/error.rs @@ -42,7 +42,11 @@ impl Display for ExonFastaError { write!(f, "Invalid nucleotide: {}", nucleotide) } ExonFastaError::InvalidAminoAcid(amino_acid) => { - write!(f, "Invalid amino acid: {}", amino_acid) + write!( + f, + "Invalid amino acid: {}", + std::char::from_u32(*amino_acid as u32).unwrap() + ) } } }