Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add BZIP2 (.bz2) support for reading fasta and fastq #185

Merged
merged 1 commit into from
Dec 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/biobear/biobear.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class FileCompressionType(enum.Enum):
GZIP = 0
BGZIP = 1
NONE = 2
BZIP2 = 3

class FastaSequenceDataType(enum.Enum):
"""How to treat the sequence data in a FASTA file."""
Expand Down
3 changes: 3 additions & 0 deletions python/biobear/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,15 @@ class Compression(Enum):
INFERRED = "INFERRED"
NONE = "NONE"
GZIP = "GZIP"
BZIP2 = "BZIP2"

@classmethod
def from_file(cls, path: os.PathLike) -> "Compression":
    """Pick the compression type that matches a path's final extension.

    The comparison is an exact, case-sensitive match on the last suffix
    of ``path``: ``.gz`` yields ``Compression.GZIP`` and ``.bz2`` yields
    ``Compression.BZIP2``. Every other suffix (or no suffix at all) yields
    ``Compression.NONE``.
    """
    # Resolve the suffix once and look it up instead of chaining ifs.
    suffix = Path(path).suffix
    by_suffix = {
        ".gz": Compression.GZIP,
        ".bz2": Compression.BZIP2,
    }
    return by_suffix.get(suffix, Compression.NONE)

def infer_or_use(self, path: os.PathLike) -> "Compression":
Expand Down
Binary file added python/tests/data/test.fa.bz2
Binary file not shown.
Binary file added python/tests/data/test.fq.bz2
Binary file not shown.
20 changes: 20 additions & 0 deletions python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ def test_read_fastq():

assert len(df) == 2

fastq_path = DATA / "test.fq.bz2"
options = FASTQReadOptions(
file_extension="fq", file_compression_type=FileCompressionType.BZIP2
)

df = session.read_fastq_file(str(fastq_path), options=options).to_polars()

assert len(df) == 2

@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down Expand Up @@ -285,6 +293,18 @@ def test_read_fasta_gz():

assert len(df) == 2

def test_read_fasta_bz2():
    """Read the bzip2-compressed FASTA fixture and check its row count."""
    # NOTE(review): other tests in this file are wrapped in a
    # `pytest.mark.skipif` guard for a missing polars install; this one calls
    # `to_polars()` without such a guard — confirm whether that is intended.
    bear_session = connect()

    compressed_fasta = DATA / "test.fa.bz2"

    # The extension is given without the ".bz2" part; compression is passed
    # separately through `file_compression_type`.
    read_options = FASTAReadOptions(
        file_extension="fa", file_compression_type=FileCompressionType.BZIP2
    )
    reader = bear_session.read_fasta_file(str(compressed_fasta), options=read_options)
    frame = reader.to_polars()

    # The fixture is expected to produce exactly two rows.
    assert len(frame) == 2

@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down
5 changes: 5 additions & 0 deletions src/file_compression_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pub enum FileCompressionType {
GZIP,
ZSTD,
UNCOMPRESSED,
BZIP2,
}

impl Default for FileCompressionType {
Expand Down Expand Up @@ -56,6 +57,7 @@ impl Display for FileCompressionType {
Self::GZIP => write!(f, "GZIP"),
Self::ZSTD => write!(f, "ZSTD"),
Self::UNCOMPRESSED => write!(f, "UNCOMPRESSED"),
Self::BZIP2 => write!(f, "BZIP2"),
}
}
}
Expand All @@ -66,6 +68,7 @@ impl From<FileCompressionType> for DFFileCompressionType {
FileCompressionType::GZIP => DFFileCompressionType::GZIP,
FileCompressionType::ZSTD => DFFileCompressionType::ZSTD,
FileCompressionType::UNCOMPRESSED => DFFileCompressionType::UNCOMPRESSED,
FileCompressionType::BZIP2 => DFFileCompressionType::BZIP2,
}
}
}
Expand All @@ -78,6 +81,7 @@ impl TryFrom<CompressionTypeVariant> for FileCompressionType {
CompressionTypeVariant::GZIP => Ok(Self::GZIP),
CompressionTypeVariant::ZSTD => Ok(Self::ZSTD),
CompressionTypeVariant::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
CompressionTypeVariant::BZIP2 => Ok(Self::BZIP2),
_ => Err(BioBearError::InvalidCompressionType(value.to_string())),
}
}
Expand All @@ -91,6 +95,7 @@ impl TryFrom<DFFileCompressionType> for FileCompressionType {
DFFileCompressionType::GZIP => Ok(Self::GZIP),
DFFileCompressionType::ZSTD => Ok(Self::ZSTD),
DFFileCompressionType::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
DFFileCompressionType::BZIP2 => Ok(Self::BZIP2),
_ => Err(BioBearError::InvalidCompressionType(
"Invalid compression type".to_string(),
)),
Expand Down
Loading