Skip to content

Commit

Permalink
refactor: one hot to int encode (#513)
Browse files Browse the repository at this point in the history
  • Loading branch information
tshauck authored May 23, 2024
1 parent 81c6d21 commit 0916241
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 21 deletions.
6 changes: 3 additions & 3 deletions exon/exon-core/tests/sqllogictests/slt/fasta-scan-tests.slt
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ statement ok
DROP TABLE fa_table;

statement ok
SET exon.fasta_sequence_data_type = 'one_hot_dna';
SET exon.fasta_sequence_data_type = 'integer_encode_dna';

statement ok
CREATE EXTERNAL TABLE exon_table STORED AS FASTA LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/fasta/test.fasta';
Expand All @@ -134,7 +134,7 @@ statement ok
DROP TABLE exon_table;

statement ok
SET exon.fasta_sequence_data_type = 'one_hot_protein';
SET exon.fasta_sequence_data_type = 'integer_encode_protein';

statement ok
CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa') LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/faa/test.faa';
Expand All @@ -149,7 +149,7 @@ statement ok
DROP TABLE exon_table;

statement ok
CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa', fasta_sequence_data_type 'one_hot_protein') LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/faa/test.faa';
CREATE EXTERNAL TABLE exon_table STORED AS FASTA OPTIONS (file_extension 'faa', fasta_sequence_data_type 'integer_encode_protein') LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/faa/test.faa';

query T
SELECT id, description, sequence FROM exon_table;
Expand Down
22 changes: 10 additions & 12 deletions exon/exon-fasta/src/array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@ pub struct FASTAArrayBuilder {
pub enum SequenceBuilder {
Utf8(GenericStringBuilder<i32>),
LargeUtf8(GenericStringBuilder<i64>),
OneHotDNA(GenericListBuilder<i32, Int32Builder>),
OneHotProtein(GenericListBuilder<i32, Int32Builder>),
IntegerEncodeDNA(GenericListBuilder<i32, Int32Builder>),
IntegerEncodeProtein(GenericListBuilder<i32, Int32Builder>),
}

impl SequenceBuilder {
fn finish(&mut self) -> ArrayRef {
match self {
Self::Utf8(ref mut builder) => Arc::new(builder.finish()),
Self::LargeUtf8(ref mut builder) => Arc::new(builder.finish()),
Self::OneHotProtein(ref mut builder) => Arc::new(builder.finish()),
Self::OneHotDNA(ref mut builder) => Arc::new(builder.finish()),
Self::IntegerEncodeProtein(ref mut builder) => Arc::new(builder.finish()),
Self::IntegerEncodeDNA(ref mut builder) => Arc::new(builder.finish()),
}
}
}
Expand All @@ -66,14 +66,12 @@ impl FASTAArrayBuilder {
SequenceDataType::LargeUtf8 => SequenceBuilder::LargeUtf8(
GenericStringBuilder::<i64>::with_capacity(capacity, capacity),
),
SequenceDataType::OneHotProtein => SequenceBuilder::OneHotProtein(
SequenceDataType::IntegerEncodeProtein => SequenceBuilder::IntegerEncodeProtein(
GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
),
SequenceDataType::IntegerEncodeDNA => SequenceBuilder::IntegerEncodeDNA(
GenericListBuilder::<i32, Int32Builder>::new(Int32Builder::with_capacity(capacity)),
),
SequenceDataType::OneHotDNA => {
SequenceBuilder::OneHotDNA(GenericListBuilder::<i32, Int32Builder>::new(
Int32Builder::with_capacity(capacity),
))
}
};

let projection = match projection {
Expand Down Expand Up @@ -125,7 +123,7 @@ impl FASTAArrayBuilder {
let sequence = std::str::from_utf8(sequence)?;
builder.append_value(sequence);
}
SequenceBuilder::OneHotProtein(ref mut builder) => {
SequenceBuilder::IntegerEncodeProtein(ref mut builder) => {
let values = builder.values();

for aa in sequence {
Expand Down Expand Up @@ -160,7 +158,7 @@ impl FASTAArrayBuilder {

builder.append(true);
}
SequenceBuilder::OneHotDNA(ref mut builder) => {
SequenceBuilder::IntegerEncodeDNA(ref mut builder) => {
let values = builder.values();

// Convert the DNA sequence to an integer encoding, use A => 1, C => 2, G => 3, T => 4, N => 5
Expand Down
12 changes: 6 additions & 6 deletions exon/exon-fasta/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ use object_store::ObjectStore;
pub enum SequenceDataType {
Utf8,
LargeUtf8,
OneHotProtein,
OneHotDNA,
IntegerEncodeProtein,
IntegerEncodeDNA,
}

impl FromStr for SequenceDataType {
Expand All @@ -34,8 +34,8 @@ impl FromStr for SequenceDataType {
match s {
"utf8" => Ok(Self::Utf8),
"large_utf8" => Ok(Self::LargeUtf8),
"one_hot_protein" => Ok(Self::OneHotProtein),
"one_hot_dna" => Ok(Self::OneHotDNA),
"integer_encode_protein" => Ok(Self::IntegerEncodeProtein),
"integer_encode_dna" => Ok(Self::IntegerEncodeDNA),
_ => Err("invalid sequence data type"),
}
}
Expand Down Expand Up @@ -182,14 +182,14 @@ impl FASTASchemaBuilder {
let field = Field::new("sequence", DataType::LargeUtf8, true);
fields[2] = field;
}
SequenceDataType::OneHotProtein => {
SequenceDataType::IntegerEncodeProtein => {
// List of i32
let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));

let field = Field::new("sequence", data_type, true);
fields[2] = field;
}
SequenceDataType::OneHotDNA => {
SequenceDataType::IntegerEncodeDNA => {
let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true)));

let field = Field::new("sequence", data_type, true);
Expand Down

0 comments on commit 0916241

Please sign in to comment.