Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deserialize Run Containers #255

Merged
merged 3 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you shall be dual licensed as above, without any
additional terms or conditions.

[github-actions-badge]: https://img.shields.io/github/workflow/status/RoaringBitmap/roaring-rs/Continuous%20integration.svg?style=flat-square
[github-actions-badge]:
https://github.com/RoaringBitmap/roaring-rs/actions/workflows/test.yml/badge.svg
[github-actions]: https://github.com/RoaringBitmap/roaring-rs/actions
[release-badge]: https://img.shields.io/github/release/RoaringBitmap/roaring-rs.svg?style=flat-square
[cargo]: https://crates.io/crates/roaring
Expand Down
2 changes: 1 addition & 1 deletion src/bitmap/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::ops::{
use super::store::{self, Store};
use super::util;

const ARRAY_LIMIT: u64 = 4096;
pub const ARRAY_LIMIT: u64 = 4096;

#[derive(PartialEq, Clone)]
pub struct Container {
Expand Down
68 changes: 52 additions & 16 deletions src/bitmap/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@ use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::convert::{Infallible, TryFrom};
use std::error::Error;
use std::io;
use std::ops::RangeInclusive;

use super::container::Container;
use crate::bitmap::store::{ArrayStore, BitmapStore, Store};
use crate::bitmap::container::{Container, ARRAY_LIMIT};
use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
use crate::RoaringBitmap;

const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;
// TODO: Need this once run containers are supported
// const NO_OFFSET_THRESHOLD: u8 = 4;
const NO_OFFSET_THRESHOLD: usize = 4;

// Sizes of header structures
const DESCRIPTION_BYTES: usize = 4;
const OFFSET_BYTES: usize = 4;

impl RoaringBitmap {
/// Return the size in bytes of the serialized output.
Expand Down Expand Up @@ -163,49 +167,81 @@ impl RoaringBitmap {
B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
BErr: Error + Send + Sync + 'static,
{
let (size, has_offsets) = {
// First read the cookie to determine which version of the format we are reading
let (size, has_offsets, has_run_containers) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(reader.read_u32::<LittleEndian>()? as usize, true)
(reader.read_u32::<LittleEndian>()? as usize, true, false)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
let size = ((cookie >> 16) + 1) as usize;
(size, size >= NO_OFFSET_THRESHOLD, true)
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};

// Read the run container bitmap if necessary
let run_container_bitmap = if has_run_containers {
let mut bitmap = vec![0u8; (size + 7) / 8];
reader.read_exact(&mut bitmap)?;
Some(bitmap)
} else {
None
};

if size > u16::MAX as usize + 1 {
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
}

let mut description_bytes = vec![0u8; size * 4];
// Read the container descriptions
let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
reader.read_exact(&mut description_bytes)?;
let mut description_bytes = &description_bytes[..];

if has_offsets {
let mut offsets = vec![0u8; size * 4];
let mut offsets = vec![0u8; size * OFFSET_BYTES];
reader.read_exact(&mut offsets)?;
drop(offsets); // Not useful when deserializing into memory
}

let mut containers = Vec::with_capacity(size);

for _ in 0..size {
// Read each container
for i in 0..size {
let key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;

// If the run container bitmap is present, check if this container is a run container
let is_run_container =
run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0);

let store = if is_run_container {
let runs = reader.read_u16::<LittleEndian>()?;
let mut intervals = vec![[0, 0]; runs as usize];
reader.read_exact(cast_slice_mut(&mut intervals))?;
intervals.iter_mut().for_each(|[s, len]| {
*s = u16::from_le(*s);
*len = u16::from_le(*len);
});

let store = if len <= 4096 {
let mut values = vec![0; len as usize];
let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
let mut store = Store::with_capacity(cardinality);
intervals.into_iter().for_each(|[s, len]| {
store.insert_range(RangeInclusive::new(s, s + len));
});
store
} else if cardinality <= ARRAY_LIMIT {
let mut values = vec![0; cardinality as usize];
reader.read_exact(cast_slice_mut(&mut values))?;
values.iter_mut().for_each(|n| *n = u16::from_le(*n));
let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Store::Array(array)
} else {
let mut values = Box::new([0; 1024]);
let mut values = Box::new([0; BITMAP_LENGTH]);
reader.read_exact(cast_slice_mut(&mut values[..]))?;
values.iter_mut().for_each(|n| *n = u64::from_le(*n));
let bitmap =
b(len, values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
let bitmap = b(cardinality, values)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Store::Bitmap(bitmap)
};

Expand Down
4 changes: 4 additions & 0 deletions src/bitmap/store/array_store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ impl ArrayStore {
ArrayStore { vec: vec![] }
}

/// Create an empty `ArrayStore` whose backing vector is preallocated.
///
/// The vector starts with no elements but has room reserved for at least
/// `capacity` values, so inserting up to that many values does not need to
/// reallocate.
pub fn with_capacity(capacity: usize) -> ArrayStore {
    let vec = Vec::with_capacity(capacity);
    ArrayStore { vec }
}

///
/// Create a new SortedU16Vec from a given vec
/// It is up to the caller to ensure the vec is sorted and deduplicated
Expand Down
12 changes: 11 additions & 1 deletion src/bitmap/store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ use std::ops::{
};
use std::{slice, vec};

use self::bitmap_store::BITMAP_LENGTH;
pub use self::bitmap_store::BITMAP_LENGTH;
use self::Store::{Array, Bitmap};

pub use self::array_store::ArrayStore;
pub use self::bitmap_store::{BitmapIter, BitmapStore};

use crate::bitmap::container::ARRAY_LIMIT;

#[derive(Clone)]
pub enum Store {
Array(ArrayStore),
Expand All @@ -31,6 +33,14 @@ impl Store {
Store::Array(ArrayStore::new())
}

/// Create an empty `Store` suited to hold `capacity` values.
///
/// When `capacity` does not exceed `ARRAY_LIMIT`, the result is the `Array`
/// variant wrapping `ArrayStore::with_capacity(capacity)`. Otherwise the
/// result is the `Bitmap` variant wrapping `BitmapStore::new()`; `capacity`
/// is not forwarded in that case.
pub fn with_capacity(capacity: usize) -> Store {
    let exceeds_array_limit = capacity > ARRAY_LIMIT as usize;
    if exceeds_array_limit {
        return Store::Bitmap(BitmapStore::new());
    }
    Store::Array(ArrayStore::with_capacity(capacity))
}

/// Create a `Store` in the `Bitmap` variant wrapping the store returned by
/// `BitmapStore::full()`.
pub fn full() -> Store {
    let bitmap = BitmapStore::full();
    Store::Bitmap(bitmap)
}
Expand Down
Binary file added tests/bitmapwithruns.bin
Binary file not shown.
11 changes: 10 additions & 1 deletion tests/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use roaring::RoaringBitmap;

// Test data from https://github.com/RoaringBitmap/RoaringFormatSpec/tree/master/testdata
static BITMAP_WITHOUT_RUNS: &[u8] = include_bytes!("bitmapwithoutruns.bin");
static BITMAP_WITH_RUNS: &[u8] = include_bytes!("bitmapwithruns.bin");

fn test_data_bitmap() -> RoaringBitmap {
(0..100)
Expand All @@ -21,10 +22,18 @@ fn serialize_and_deserialize(bitmap: &RoaringBitmap) -> RoaringBitmap {
}

#[test]
fn test_deserialize_from_provided_data() {
fn test_deserialize_without_runs_from_provided_data() {
assert_eq!(RoaringBitmap::deserialize_from(BITMAP_WITHOUT_RUNS).unwrap(), test_data_bitmap());
}

#[test]
fn test_deserialize_with_runs_from_provided_data() {
    // Decode the bytes embedded from `bitmapwithruns.bin` and compare the
    // result against the bitmap built by `test_data_bitmap`.
    let mut input: &[u8] = &BITMAP_WITH_RUNS[..];
    let decoded = RoaringBitmap::deserialize_from(&mut input).unwrap();
    let expected = test_data_bitmap();
    assert_eq!(decoded, expected);
}

#[test]
fn test_serialize_into_provided_data() {
let bitmap = test_data_bitmap();
Expand Down