Skip to content

Commit

Permalink
fix: better handling for ambiguous AAs
Browse files Browse the repository at this point in the history
  • Loading branch information
tshauck committed May 24, 2024
1 parent 0ac0895 commit a2eb1c0
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 52 deletions.
11 changes: 7 additions & 4 deletions exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa')
query T
SELECT id, description, sequence FROM exon_table;
----
a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]

statement ok
DROP TABLE exon_table;
Expand All @@ -154,5 +154,8 @@ CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa',
query T
SELECT id, description, sequence FROM exon_table;
----
a description [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
b description2 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
a description [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]
b description2 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23]

statement ok
DROP TABLE exon_table;
98 changes: 63 additions & 35 deletions exon/exon-fasta/src/array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
use std::{str::FromStr, sync::Arc};

use arrow::{
array::{ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder},
array::{ArrayRef, GenericListBuilder, GenericStringBuilder, Int32Builder},
datatypes::SchemaRef,
error::ArrowError,
};
Expand All @@ -29,6 +29,8 @@ pub struct FASTAArrayBuilder {
descriptions: GenericStringBuilder<i32>,
sequences: SequenceBuilder,
projection: Vec<usize>,
append_name: bool,
append_description: bool,
append_sequence: bool,
rows: usize,
}
Expand Down Expand Up @@ -67,10 +69,16 @@ impl FASTAArrayBuilder {
GenericStringBuilder::<i64>::with_capacity(capacity, capacity),
),
SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein(
GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
GenericListBuilder::<i32, Int32Builder>::with_capacity(
Int32Builder::with_capacity(60),
capacity,
),
),
SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA(
GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
GenericListBuilder::<i32, Int32Builder>::with_capacity(
Int32Builder::with_capacity(60),
capacity,
),
),
};

Expand All @@ -79,7 +87,9 @@ impl FASTAArrayBuilder {
None => (0..schema.fields().len()).collect(),
};

let append_sequence = true;
let append_name = projection.contains(&0);
let append_description = projection.contains(&1);
let append_sequence = projection.contains(&2);

Ok(Self {
names: GenericStringBuilder::<i32>::with_capacity(capacity, capacity),
Expand All @@ -88,29 +98,37 @@ impl FASTAArrayBuilder {
projection,
rows: 0,
append_sequence,
append_name,
append_description,
})
}

pub fn len(&self) -> usize {
self.names.len()
self.rows
}

pub fn is_empty(&self) -> bool {
self.len() == 0
}

pub fn append(&mut self, definition: &str, sequence: &[u8]) -> Result<(), ArrowError> {
let definition =
Definition::from_str(definition).map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
if self.append_name || self.append_description {
let definition = Definition::from_str(definition)
.map_err(|e| ArrowError::ExternalError(Box::new(e)))?;

let name = std::str::from_utf8(definition.name())?;
self.names.append_value(name);
if self.append_name {
let name = std::str::from_utf8(definition.name())?;
self.names.append_value(name);
}

if let Some(description) = definition.description() {
let description = std::str::from_utf8(description)?;
self.descriptions.append_value(description);
} else {
self.descriptions.append_null();
if self.append_description {
if let Some(description) = definition.description() {
let description = std::str::from_utf8(description)?;
self.descriptions.append_value(description);
} else {
self.descriptions.append_null();
}
}
}

if self.append_sequence {
Expand All @@ -129,25 +147,30 @@ impl FASTAArrayBuilder {
for aa in sequence {
let aa = match aa {
b'A' => 1,
b'C' => 2,
b'D' => 3,
b'E' => 4,
b'F' => 5,
b'G' => 6,
b'H' => 7,
b'I' => 8,
b'K' => 9,
b'L' => 10,
b'M' => 11,
b'N' => 12,
b'P' => 13,
b'Q' => 14,
b'R' => 15,
b'S' => 16,
b'T' => 17,
b'V' => 18,
b'W' => 19,
b'Y' => 20,
b'B' => 2,
b'C' => 3,
b'D' => 4,
b'E' => 5,
b'F' => 6,
b'G' => 7,
b'H' => 8,
b'I' => 9,
b'K' => 10,
b'L' => 11,
b'M' => 12,
b'N' => 13,
b'O' => 14,
b'P' => 15,
b'Q' => 16,
b'R' => 17,
b'S' => 18,
b'T' => 19,
b'U' => 20,
b'V' => 21,
b'W' => 22,
b'Y' => 23,
b'X' => 24,
b'Z' => 25,
_ => {
return Err(ExonFastaError::InvalidAminoAcid(*aa).into());
}
Expand Down Expand Up @@ -190,8 +213,13 @@ impl FASTAArrayBuilder {
fn finish_inner(&mut self) -> Vec<ArrayRef> {
let mut arrays: Vec<ArrayRef> = Vec::with_capacity(self.projection.len());

arrays.push(Arc::new(self.names.finish()));
arrays.push(Arc::new(self.descriptions.finish()));
if self.append_name {
arrays.push(Arc::new(self.names.finish()));
}

if self.append_description {
arrays.push(Arc::new(self.descriptions.finish()));
}

if self.append_sequence {
arrays.push(self.sequences.finish());
Expand Down
18 changes: 6 additions & 12 deletions exon/exon-fasta/src/batch_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ where
}

async fn read_batch(&mut self) -> ExonFastaResult<Option<RecordBatch>> {
let mut record_batch = FASTAArrayBuilder::create(
let mut array_builder = FASTAArrayBuilder::create(
self.config.file_schema.clone(),
self.config.projection.clone(),
self.config.batch_size,
Expand All @@ -82,26 +82,20 @@ where
self.sequence_buffer.clear();

if (self.read_record().await?).is_some() {
record_batch.append(&self.buf, &self.sequence_buffer)?;
array_builder.append(&self.buf, &self.sequence_buffer)?;
} else {
break;
}
}

if record_batch.is_empty() {
if array_builder.is_empty() {
return Ok(None);
}

let batch = RecordBatch::try_new(self.config.file_schema.clone(), record_batch.finish())?;
let schema = self.config.projected_schema()?;
let batch = array_builder.try_into_record_batch(schema)?;

match &self.config.projection {
Some(projection) => {
let projected_batch = batch.project(projection)?;

Ok(Some(projected_batch))
}
None => Ok(Some(batch)),
}
Ok(Some(batch))
}

/// Converts the reader into a stream of batches.
Expand Down
14 changes: 14 additions & 0 deletions exon/exon-fasta/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,20 @@ impl FASTAConfig {
self
}

/// Get the projection, returning the identity projection if none is set.
pub fn projection(&self) -> Vec<usize> {
self.projection
.clone()
.unwrap_or_else(|| (0..self.file_schema.fields().len()).collect())
}

/// Get the projected schema.
pub fn projected_schema(&self) -> arrow::error::Result<SchemaRef> {
let schema = self.file_schema.project(&self.projection())?;

Ok(Arc::new(schema))
}

/// Create a new FASTA configuration with a given projection.
pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
// Only include fields that are in the file schema.
Expand Down
6 changes: 5 additions & 1 deletion exon/exon-fasta/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ impl Display for ExonFastaError {
write!(f, "Invalid nucleotide: {}", nucleotide)
}
ExonFastaError::InvalidAminoAcid(amino_acid) => {
write!(f, "Invalid amino acid: {}", amino_acid)
write!(
f,
"Invalid amino acid: {}",
std::char::from_u32(*amino_acid as u32).unwrap()
)
}
}
}
Expand Down

0 comments on commit a2eb1c0

Please sign in to comment.