From 56183149a2a201b1aaa36b936ec643decb1fde92 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 7 Jul 2023 13:39:41 +0200 Subject: [PATCH 1/3] Deserialize run containers and convert them into bitmap/array Co-authored-by: Joseph Glanville --- src/bitmap/container.rs | 2 +- src/bitmap/serialization.rs | 68 ++++++++++++++++++++++------- src/bitmap/store/array_store/mod.rs | 4 ++ src/bitmap/store/mod.rs | 12 ++++- 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/bitmap/container.rs b/src/bitmap/container.rs index e6d0cf84..9cbab634 100644 --- a/src/bitmap/container.rs +++ b/src/bitmap/container.rs @@ -6,7 +6,7 @@ use std::ops::{ use super::store::{self, Store}; use super::util; -const ARRAY_LIMIT: u64 = 4096; +pub const ARRAY_LIMIT: u64 = 4096; #[derive(PartialEq, Clone)] pub struct Container { diff --git a/src/bitmap/serialization.rs b/src/bitmap/serialization.rs index c2871c4a..6ae84df4 100644 --- a/src/bitmap/serialization.rs +++ b/src/bitmap/serialization.rs @@ -3,15 +3,19 @@ use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::convert::{Infallible, TryFrom}; use std::error::Error; use std::io; +use std::ops::RangeInclusive; -use super::container::Container; -use crate::bitmap::store::{ArrayStore, BitmapStore, Store}; +use crate::bitmap::container::{Container, ARRAY_LIMIT}; +use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; use crate::RoaringBitmap; const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u16 = 12347; -// TODO: Need this once run containers are supported -// const NO_OFFSET_THRESHOLD: u8 = 4; +const NO_OFFSET_THRESHOLD: usize = 4; + +// Sizes of header structures +const DESCRIPTION_BYTES: usize = 4; +const OFFSET_BYTES: usize = 4; impl RoaringBitmap { /// Return the size in bytes of the serialized output. @@ -163,49 +167,81 @@ impl RoaringBitmap { B: Fn(u64, Box<[u64; 1024]>) -> Result, BErr: Error + Send + Sync + 'static, { - let (size, has_offsets) = { + // First read the cookie to determine which version of the format we are reading + let (size, has_offsets, has_run_containers) = { let cookie = reader.read_u32::()?; if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { - (reader.read_u32::()? as usize, true) + (reader.read_u32::()? as usize, true, false) } else if (cookie as u16) == SERIAL_COOKIE { - return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); + let size = ((cookie >> 16) + 1) as usize; + (size, size >= NO_OFFSET_THRESHOLD, true) } else { return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); } }; + // Read the run container bitmap if necessary + let run_container_bitmap = if has_run_containers { + let mut bitmap = vec![0u8; (size + 7) / 8]; + reader.read_exact(&mut bitmap)?; + Some(bitmap) + } else { + None + }; + if size > u16::MAX as usize + 1 { return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); } - let mut description_bytes = vec![0u8; size * 4]; + // Read the container descriptions + let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES]; reader.read_exact(&mut description_bytes)?; let mut description_bytes = &description_bytes[..]; if has_offsets { - let mut offsets = vec![0u8; size * 4]; + let mut offsets = vec![0u8; size * OFFSET_BYTES]; reader.read_exact(&mut offsets)?; drop(offsets); // Not useful when deserializing into memory } let mut containers = Vec::with_capacity(size); - for _ in 0..size { + // Read each container + for i in 0..size { let key = description_bytes.read_u16::()?; - let len = u64::from(description_bytes.read_u16::()?) + 1; + let cardinality = u64::from(description_bytes.read_u16::()?) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::()?; + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); - let store = if len <= 4096 { - let mut values = vec![0; len as usize]; + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().for_each(|[s, len]| { + store.insert_range(RangeInclusive::new(s, s + len)); + }); + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; reader.read_exact(cast_slice_mut(&mut values))?; values.iter_mut().for_each(|n| *n = u16::from_le(*n)); let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Array(array) } else { - let mut values = Box::new([0; 1024]); + let mut values = Box::new([0; BITMAP_LENGTH]); reader.read_exact(cast_slice_mut(&mut values[..]))?; values.iter_mut().for_each(|n| *n = u64::from_le(*n)); - let bitmap = - b(len, values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Bitmap(bitmap) }; diff --git a/src/bitmap/store/array_store/mod.rs b/src/bitmap/store/array_store/mod.rs index ca6ee206..543663df 100644 --- a/src/bitmap/store/array_store/mod.rs +++ b/src/bitmap/store/array_store/mod.rs @@ -21,6 +21,10 @@ impl ArrayStore { ArrayStore { vec: vec![] } } + pub fn with_capacity(capacity: usize) -> ArrayStore { + ArrayStore { vec: Vec::with_capacity(capacity) } + } + /// /// Create a new SortedU16Vec from a given vec /// It is up to the caller to ensure the vec is sorted and deduplicated diff --git a/src/bitmap/store/mod.rs b/src/bitmap/store/mod.rs index 653e14fe..9172a01a 100644 --- a/src/bitmap/store/mod.rs +++ b/src/bitmap/store/mod.rs @@ -7,12 +7,14 @@ use std::ops::{ }; use std::{slice, vec}; -use self::bitmap_store::BITMAP_LENGTH; +pub use self::bitmap_store::BITMAP_LENGTH; use self::Store::{Array, Bitmap}; pub use self::array_store::ArrayStore; pub use self::bitmap_store::{BitmapIter, BitmapStore}; +use crate::bitmap::container::ARRAY_LIMIT; + #[derive(Clone)] pub enum Store { Array(ArrayStore), @@ -31,6 +33,14 @@ impl Store { Store::Array(ArrayStore::new()) } + pub fn with_capacity(capacity: usize) -> Store { + if capacity <= ARRAY_LIMIT as usize { + Store::Array(ArrayStore::with_capacity(capacity)) + } else { + Store::Bitmap(BitmapStore::new()) + } + } + pub fn full() -> Store { Store::Bitmap(BitmapStore::full()) } From 8f23ccf41e9a6d2fb220d6def6041269ad669c27 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 7 Jul 2023 13:38:36 +0200 Subject: [PATCH 2/3] Add a test for the run containers --- tests/bitmapwithruns.bin | Bin 0 -> 48056 bytes tests/serialization.rs | 11 ++++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tests/bitmapwithruns.bin diff --git a/tests/bitmapwithruns.bin b/tests/bitmapwithruns.bin new file mode 100644 index 0000000000000000000000000000000000000000..5ed243753e169295a32d6251db66180f23ceac06 GIT binary patch literal 48056 zcmeIuQyb)3w5Z{vVpUYBB$bL&skUv~wr$(CZQHhO+qPG)cJFo0f7tsf^BLb5*YnMt zfdl{y015&iU;{8fdI(UZDhgDsiv4%-|000=3<=DyNCAKU(!k#=^#A~P^j|*sU-o|$ z{I7=pHSoU{0>HA}0I=r2HvJa>ynrrXJE#Th2(g1)Ln~mWaJl5SR1eaf$h0Bbj9dfq zwJ21fSdLOwIlR(`syAvLsJo)!gr+@OHfUR-V}`CuPoVFM0T+hs7_nf?hzT90)R<9V zPKpKGl5fi&tUj@R!{!;=2kdUKzrx`h$F0-&`45+FTsv@U#k~oSdOU0Js>HhtpOmk_ zj~Bl$fIE-{#0aJXUqdROrqDV2oA)5ai8LEB%*ZkzM~gfa3gjqamB7nU@R~Jm<&t1Go*`10DiTfaky~;4Sb0_zZjnz5~C2zd#5? zKpdpNFvx=uPzGb54qBiK`d~^h4VWIx1ZD+ufVshZU_r15SR5<`mIW(-mBDIYO|TAF zA8Z6R1zUiv!FFIrunX87>;?7(2Y`dYVch zsvW8qY8Yw~Y94A8Y8&bh>Ky78>KW=2>K_^u8X6iA8XX!Jni!fAnjV@Jnj2aWS{zyy zS{YgsS|8dJ+8Wvs+8x>#Iv6?H+nJ`auJsA<%GW6f_o^08NIbK{KH_(0phS zv=mwat%lY?8=)=Gc4!y07dikPhK@lep)=5V=n`}lx&hsW?m-WsC(v`~74#PR0DXqO zLf@fZ&|fG7BQOqAa2V#{2rR=fScfgxg?%_BoCZ!0XM(fBIpExIKDZ!U1TGGjg3H1c z;L30{xF%c&t`9eYo5C&N)^IzxBise<4)=ol!UN#J@Gy8JJO&;QPlBhyGvL|qJa{3z z1YQoWg4e(i!Q7^hEj~{gFY)P-Fx$8X1R7M5Z9qky*%GWC5}m zS%$1c)*$PVO~_Vc2eKR4ha5zXAjgqY$XVn9av8aX+(hmm_mM})Q{)Bm8hM9&M7|*Z zB0rGd2!KK;iV`S|vZ#PcsDf&!f!e5tCec)AS~LTi8O?^~MDw8e(L!iZv;;r(I93WPi&elX zW7V*lSRJfB)(C5gwZK|q?XZqm7pyzh3+sywzy@Q(u#wmpY&|73?~83%iRwz#e1Iu$R~y>^=4g`v?1m z{lxxYAP(agPT~yC;UXTzRXmQHxPvF~6nJVp9i9=-f@jBb;d$`_cwxL4UJ@^Zm&YsN zRq+~lZM+`d5O0Dv$6MiT@eX)ryc^yV?}PWp2jN5U5%_3)96k}Bf=|b1;dAi?_+oq+ zz7k)9ug5pxTk#$EZhRkp5I=$+$4}vB@eBB6{2G1}zk}b$AK_2&7x-)Z9sUvjg8z&E zz<=Wa0TC!c5H!IO0wEC!p%DgQ6CRNyQW0s13`Axk8vMJeuY)!T!JCa?FOpZt>*Ou+F8P3bOgr4~?&sb$nk zY7MoX+C*)oc2K*iebhnf2z8t~MV+NCP?xD|)J^IRb)R}fJ*8eyuc>#`N9qgpFZF}^ zO#w7SqclO&G)oJ#L@TsL8?;S(bdpX*r=>H{ndxkFPC5^rpDsifrAyGI>2h>Mx(Z#L zu0_|S8_2>r*dJDas-bL@F5739{WAsV-41J!yL|>(E(6{M(^h5dy{hWS9zokFWpXsmkclsCo zmku!qgEJHpW_TvT$V`mU8H;flpGnE2VbU|1n5;|=CO4ChDaaIIiZi8{vP=c0GECE6T?%&)5EjEbHfY5 zi^I#pE5mET>%*JETf;lTyTkj!2g66g$HS+>XTul5m&4b>H^X)aV-413JvPavV$-r2*vxD;HYb~h&CeEMi?Suy z(rh`lB3p&6&eme@oHvdxkyFUShAZH`v?kJ@z5{gniDwV&AeK z*w5@&_B;EF{mX_pgu^+C3v)ad;bbnx>72#6oX@4?(s1dyOk7qj2bY`6#}(v?aK*V& zTv@IHSDCBE)#U1M^|?k|Q?3Qqnrp{(~#&F}gN!(O!1~;3V z$1UWRaLc(>+*)n}x0&0FZ0*=W^Y8eN z{1^UT{s;e?2Lwn!1wx<&RuBY9Py|gd1Y7Wgq>xHTD`XHd3)zI6LLMQ%P)H~$ln_b_ z<%Eht6`{INOQD@+h33)6&|!W?0~ zut-=ctPoZU>x7NM7Gb-vOV}$M5Dp8+gphr$!#x$sJOD|`?> z3txrr!Y|>k5E2m)7b!6;@?u1k#h9pzmgtJUm{Lq5rWZ4bS;ZV;ZZV%&P%I)A7fXp{ z#R_6&v6@&@tRvPJ8;MQD7Gi6$o!C+AB6b&hiG9TZ;$U%@I8q!Vju$71Q^gtLY;m5r zP+TG|7gvdE#SP+SahteP+#~K64~a*`6XI#{oOn^ZB3>77iFd^Z;$!id_)>f$z861< z|A^njpW+`8jKC2rLPnSf7ZD@Th#HAU%!m_7L{dajN76+yMzTb*M{-5-MhZja*aMs`GYNA^VyMvg>|M@~h~MlM7yN3KO~M(#xJM;=9< zMqWf-N8Uv~M!rP;jr@rGjsOxQp%Nj{5-SOkBq@?68Imn|Qc_AKrIj*BnWbz}PAQL+ zUn(ROl}bpZrE*e5sftuxswLHx8c2<$W>QP3jnrQ1Bz2W~NWG?`qo1PxM88FUM*l=X8J01blo^?mML8;~a$GiLM^4Bo>Xd_+DjpOVkY7v#(G zHTkA|N4_sVlAp>ij{qEbbvuGCWMDh-szN;9RU(ne{obW*x1J(S)`KV_gY zL>aD(QpPG1l*!68Wu`JmnXfETmMSZh)yg_$qq0TWuIy6wDhHIq$}#1naz;6?TvDzo zH{YDM0u{fQr;>bl+Vgn<-77r`KyFfM8#D~4XeBwQDrrz>Z+x>s;{P0)2Qjy zOlnp&hnic>rxsL;sKwP%YFV{{T3M~8)>P}L_0>jdQ?-TKT5YFxRJ*9%)m~~}b$~ip z9j1;{$Ef4gN$ON}hB{lFr!G{NsLRz=>RNS!x>?<(?o{`v`_)70QT2p+T0N&;RIjMl z)m!Ra^?~|WeWt!t->C1^PwGGFH}$9bM+IYW42zL5CdS3YSTv@_;xRMk#1gR-vDC42 zv5c`SvFx#2vAnSYvBI%pv68VevGTD>v8u5evD&eEv4*iGvF5Q>v9_@evCgq>v7WI$ zvHr0^v7xaMvC*+{v5B!MvFWi{vAMAYvBj}vv6ZnkvGuV{v8}NkvE8wKv4gQAvE#8* zv9qxYvCFY*v750wvHP({v8S;YvDdM8v5&DYv43MfV!va625G29Xtc&^f+lH-rfG&| zYo3Ro7~1b+rasW38FiQfs5N*E(rk zwH{h;t)Dhf8=?)@MrmWU3EE_Bnl@9Lqs`YAX-l;g+G=f`wo%)nZP#{bd$j}FVeOc9 zQahua*Dh&SwHw-P?Vk2fd!jwpUTJT&587w#tM*;{rTx`HaU_n%sdzZf$0Kn$9*gU7 zEAGbqc*=O1c=~vzc-DB1cQ>zc++@`c*-`11Iw_}ch}_~!Vw_|EvA z`2P5z_|f=@`04n$_{I2@`1Sa$_}%z}_~ZDq_{;d4`1|;$_&@P)@t^TOaZrbKOeb|l z=X6nz>Z%^sP2JHGdI~+Yo=(rGXVJ6kx%9kx0llzZOfRXI(aYTrX zo9nIgwt5G>v))bbsrS+Q>x1;6`UrirK2D#gPtm9Av-G+80)4T*Okb(5(bwyn^sV|1 zeYd_(Kd2wkkL#!Ov-$=7vVKj!so&A>>yPxO`V0NF{!ag>f6@Qdf9St;z<>5V|#sp)sG0m81%rWL0i;Shl3S+gg&e&*dF}54K zjJ?JIdRd-Id|kNM5~Y5p-m3$`$ev>1!C zL@R2kR@^cz$4Xc!tkhOIE2EXg%5LSd@>&I~!d5Y>q*cZ$Z&k9YS~aZNRz0hs)x>IU zwX)h;9jwk)H>;=B$LenlvW8kCtkKpuYoay9nr_Xq=2{D^#nv)wrM1RdZ*8)+T05-W z);{Z?b;LSuowCka7p%+HHS4Bz$GUGlvYuKmtk>2%>!bC>`q%nl{k8xbvQe9`X`8hL zTe1~fvklv}Jv(WqveVib?96sHJExt;&Tkj8i`pgZ(snt!qFu$VZr8Hw+70Z+b~C%B z-NtTjce1-H|*Q?J^P{k#C~qSvftVt?9cXB`@8+i{%eOE#K9fP2|K(K zabzdv=#J&Mj_;&&(m3gzOiorOhm+gM=M;2`IK`b(PFbgdQ`xEJ)O6}N^_@meQ>TT~ z+G*!>bh~!`x z`<+A1QRjqn+BxT3bgnqpom;ihm?yXoAFZWcGYo6F7X7H|u@#oUr^8MnM!$*tV$JGW*+nyW`x6?i6>rJIkHxE^rsS%iNXj8h5?B$=&MiaCf`= z+=K2B_qcn?J?ma@FT2;=o9-R=zWd01>b`JayYJkO?icr8_lNu21w6<@J;I|s))PF* zQ#{QxJlpfUq?gJ|>t*mVd)d64ULG&MSI8^smGDY?<-Ces6|cHi%d6`(@EUu~yp~=Y zuf5mF>+1FJdVBr6f!+{rxHrlh>rL<`d(*s`-W+efx5!)St?*WR>%5KL7H_+^%iHT6 z@D6*&yp!G;@4R=(yXxKWZhQBwWM(dtbfp-Y@U37fK)rJV7PG2|f`? z$cb1&Pgn^z;U`ii(j?L+G9|JmawKvm@+Ar;iX@6BN+rrBDkLf=swHYB>LltX8YP-0 zS|nO0+9f(Bx+J|;LZGd|~we$-d}xNrK7pYT)osr__*Mn8+6-OuIc^$Ylg{bGJezl>krujE(tYxuSO zdVWK{iQn9B<+t@a_?`W3eow!T-`^kP5A{d*qy2IIM1P7u-Jj*p^%wYy{bl}2e~rK1 z-{f!gclf*gef~lJh=1Ha<)8I0_?P`_{!Ramf8T%PKlNYuul;xaNB@ieum8jU?E^_D zi6)68on(_jQc5aGEomg}q?b%4Qzg?TGbA%7vn6vT^Ca^p3nhytOC(Dt%Oxu&t0b!@ zYbEO@8zdVin69`4vF`0RH~{|2xb70P7F}00000 literal 0 HcmV?d00001 diff --git a/tests/serialization.rs b/tests/serialization.rs index 0b31b060..42efdd43 100644 --- a/tests/serialization.rs +++ b/tests/serialization.rs @@ -4,6 +4,7 @@ use roaring::RoaringBitmap; // Test data from https://github.com/RoaringBitmap/RoaringFormatSpec/tree/master/testdata static BITMAP_WITHOUT_RUNS: &[u8] = include_bytes!("bitmapwithoutruns.bin"); +static BITMAP_WITH_RUNS: &[u8] = include_bytes!("bitmapwithruns.bin"); fn test_data_bitmap() -> RoaringBitmap { (0..100) @@ -21,10 +22,18 @@ fn serialize_and_deserialize(bitmap: &RoaringBitmap) -> RoaringBitmap { } #[test] -fn test_deserialize_from_provided_data() { +fn test_deserialize_without_runs_from_provided_data() { assert_eq!(RoaringBitmap::deserialize_from(BITMAP_WITHOUT_RUNS).unwrap(), test_data_bitmap()); } +#[test] +fn test_deserialize_with_runs_from_provided_data() { + assert_eq!( + RoaringBitmap::deserialize_from(&mut &BITMAP_WITH_RUNS[..]).unwrap(), + test_data_bitmap() + ); +} + #[test] fn test_serialize_into_provided_data() { let bitmap = test_data_bitmap(); From 95664bd5e8b1feb7ed15168a26fe062ce44a27f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 9 Jul 2023 13:37:25 +0200 Subject: [PATCH 3/3] Update README badge --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dc692f8b..2d0c259a 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you shall be dual licensed as above, without any additional terms or conditions. -[github-actions-badge]: https://img.shields.io/github/workflow/status/RoaringBitmap/roaring-rs/Continuous%20integration.svg?style=flat-square +[github-actions-badge]: +https://github.com/RoaringBitmap/roaring-rs/actions/workflows/test.yml/badge.svg [github-actions]: https://github.com/RoaringBitmap/roaring-rs/actions [release-badge]: https://img.shields.io/github/release/RoaringBitmap/roaring-rs.svg?style=flat-square [cargo]: https://crates.io/crates/roaring