diff --git a/CMakeLists.txt b/CMakeLists.txt index 79c97a35c..ec0a5e958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,9 @@ endif() set(SNMALLOC_MIN_ALLOC_SIZE "" CACHE STRING "Minimum allocation bytes (power of 2)") set(SNMALLOC_MIN_ALLOC_STEP_SIZE "" CACHE STRING "Minimum allocation step (power of 2)") +set(SNMALLOC_DEALLOC_BATCH_RING_ASSOC "" CACHE STRING "Associativity of deallocation batch cache; 0 to disable") +set(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS "" CACHE STRING "Logarithm of number of deallocation batch cache associativity sets") + if(MSVC AND SNMALLOC_STATIC_LIBRARY AND (SNMALLOC_STATIC_LIBRARY_PREFIX STREQUAL "")) message(FATAL_ERROR "Empty static library prefix not supported on MSVC") endif() @@ -251,6 +254,8 @@ if (SNMALLOC_NO_REALLOCARR) endif() add_as_define_value(SNMALLOC_MIN_ALLOC_SIZE) add_as_define_value(SNMALLOC_MIN_ALLOC_STEP_SIZE) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) target_compile_definitions(snmalloc INTERFACE $<$:MALLOC_USABLE_SIZE_QUALIFIER=const>) diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 883c471c6..ee170c38f 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -188,6 +188,14 @@ namespace snmalloc local_state.get_object_range()->dealloc_range(arena, size); } + SNMALLOC_FAST_PATH static capptr::Alloc + capptr_rederive_alloc(capptr::Alloc a, size_t objsize) + { + return capptr_to_user_address_control( + Aal::capptr_bound( + Authmap::amplify(a), objsize)); + } + template SNMALLOC_FAST_PATH static const PagemapEntry& get_metaentry(address_t p) { diff --git a/src/snmalloc/ds/allocconfig.h b/src/snmalloc/ds/allocconfig.h index 81a72303a..78ea9f41a 100644 --- a/src/snmalloc/ds/allocconfig.h +++ b/src/snmalloc/ds/allocconfig.h @@ -120,6 +120,45 @@ namespace snmalloc static constexpr size_t REMOTE_SLOTS = 1 << REMOTE_SLOT_BITS; static constexpr size_t REMOTE_MASK = REMOTE_SLOTS - 1; +#if defined(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) + static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + SNMALLOC_DEALLOC_BATCH_RING_ASSOC; +#else +# if defined(__has_cpp_attribute) +# if ( \ + __has_cpp_attribute(msvc::no_unique_address) && \ + (__cplusplus >= 201803L || _MSVC_LANG >= 201803L)) || \ + __has_cpp_attribute(no_unique_address) + // For C++20 or later, we do have [[no_unique_address]] and so can also do + // batching if we aren't turning on the backward-pointer mitigations + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = + mitigations(freelist_backward_edge) ? 4 : 2; +# else + // For C++17, we don't have [[no_unique_address]] and so we always end up + // needing all four pointers' worth of space (because BatchedRemoteMessage has + // two freelist::Object::T<> links within, each of which will have two fields + // and will be padded to two pointers). + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif +# else + // If we don't even have the feature test macro, we're C++17 or earlier. + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif + + static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + (MIN_ALLOC_SIZE >= (DEALLOC_BATCH_MIN_ALLOC_WORDS * sizeof(void*))) ? 
2 : 0; +#endif + +#if defined(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = + SNMALLOC_DEALLOC_BATCH_RING_SET_BITS; +#else + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = 3; +#endif + + static constexpr size_t DEALLOC_BATCH_RINGS = + DEALLOC_BATCH_RING_ASSOC * bits::one_at_bit(DEALLOC_BATCH_RING_SET_BITS); + static_assert( INTERMEDIATE_BITS < MIN_ALLOC_STEP_BITS, "INTERMEDIATE_BITS must be less than MIN_ALLOC_BITS"); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 99b0a9568..aca108a6d 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -380,9 +380,15 @@ namespace snmalloc } /** - * Very slow path for deallocating an object locally. + * Very slow path for object deallocation. + * + * The object has already been returned to the slab, so all that is left to + * do is update its metadata and, if that pushes us into having too many + * unused slabs in this size class, return some. + * + * Also while here, check the time. */ - SNMALLOC_SLOW_PATH void dealloc_local_object_slower( + SNMALLOC_SLOW_PATH void dealloc_local_object_meta( const PagemapEntry& entry, BackendSlabMetadata* meta) { smallsizeclass_t sizeclass = entry.get_sizeclass().as_small(); @@ -427,14 +433,17 @@ namespace snmalloc * This is either waking up a slab that was not actively being used * by this thread, or handling the final deallocation onto a slab, * so it can be reused by other threads. + * + * Live large objects look like slabs that need attention when they become + * free; that attention is also given here. */ - SNMALLOC_SLOW_PATH void - dealloc_local_object_slow(capptr::Alloc p, const PagemapEntry& entry) + SNMALLOC_SLOW_PATH void dealloc_local_object_slow( + capptr::Alloc p, + const PagemapEntry& entry, + BackendSlabMetadata* meta) { // TODO: Handle message queue on this path? - auto* meta = entry.get_slab_metadata(); - if (meta->is_large()) { // Handle large deallocation here. @@ -460,7 +469,8 @@ namespace snmalloc return; } - dealloc_local_object_slower(entry, meta); + // Not a large object; update slab metadata + dealloc_local_object_meta(entry, meta); } /** @@ -503,13 +513,11 @@ namespace snmalloc SNMALLOC_FAST_PATH_LAMBDA { return capptr_domesticate(local_state, p); }; - auto cb = [this, - &need_post](freelist::HeadPtr msg) SNMALLOC_FAST_PATH_LAMBDA { + auto cb = [this, domesticate, &need_post]( + capptr::Alloc msg) SNMALLOC_FAST_PATH_LAMBDA { auto& entry = Config::Backend::template get_metaentry(snmalloc::address_cast(msg)); - - handle_dealloc_remote(entry, msg.as_void(), need_post); - + handle_dealloc_remote(entry, msg, need_post, domesticate); return true; }; @@ -548,10 +556,12 @@ namespace snmalloc * * need_post will be set to true, if capacity is exceeded. */ + template void handle_dealloc_remote( const PagemapEntry& entry, - CapPtr p, - bool& need_post) + capptr::Alloc msg, + bool& need_post, + Domesticator_queue domesticate) { // TODO this needs to not double count stats // TODO this needs to not double revoke if using MTE @@ -559,21 +569,43 @@ namespace snmalloc if (SNMALLOC_LIKELY(entry.get_remote() == public_state())) { - dealloc_local_object(p, entry); + auto meta = entry.get_slab_metadata(); + + auto unreturned = + dealloc_local_objects_fast(msg, entry, meta, entropy, domesticate); + + /* + * dealloc_local_objects_fast has updated the free list but not updated + * the slab metadata; it falls to us to do so. It is UNLIKELY that we + * will need to take further steps, but we might. 
+ */ + if (SNMALLOC_UNLIKELY(unreturned.template step())) + { + dealloc_local_object_slow(msg.as_void(), entry, meta); + + while (SNMALLOC_UNLIKELY(unreturned.template step())) + { + dealloc_local_object_meta(entry, meta); + } + } + return; } - else + + auto nelem = RemoteMessage::template ring_size( + msg, + freelist::Object::key_root, + entry.get_slab_metadata()->as_key_tweak(), + domesticate); + if ( + !need_post && + !attached_cache->remote_dealloc_cache.reserve_space(entry, nelem)) { - if ( - !need_post && - !attached_cache->remote_dealloc_cache.reserve_space(entry)) - { - need_post = true; - } - attached_cache->remote_dealloc_cache - .template dealloc( - entry.get_remote()->trunc_id(), p.as_void()); + need_post = true; } + attached_cache->remote_dealloc_cache + .template forward( + entry.get_remote()->trunc_id(), msg); } /** @@ -698,10 +730,12 @@ namespace snmalloc CapPtr p, const typename Config::PagemapEntry& entry) { - if (SNMALLOC_LIKELY(dealloc_local_object_fast(entry, p, entropy))) + auto meta = entry.get_slab_metadata(); + + if (SNMALLOC_LIKELY(dealloc_local_object_fast(p, entry, meta, entropy))) return; - dealloc_local_object_slow(p, entry); + dealloc_local_object_slow(p, entry, meta); } SNMALLOC_FAST_PATH void @@ -714,12 +748,11 @@ namespace snmalloc } SNMALLOC_FAST_PATH static bool dealloc_local_object_fast( - const PagemapEntry& entry, CapPtr p, + const PagemapEntry& entry, + BackendSlabMetadata* meta, LocalEntropy& entropy) { - auto meta = entry.get_slab_metadata(); - SNMALLOC_ASSERT(!meta->is_unused()); snmalloc_check_client( @@ -736,6 +769,42 @@ namespace snmalloc return SNMALLOC_LIKELY(!meta->return_object()); } + template + SNMALLOC_FAST_PATH static auto dealloc_local_objects_fast( + capptr::Alloc msg, + const PagemapEntry& entry, + BackendSlabMetadata* meta, + LocalEntropy& entropy, + Domesticator domesticate) + { + SNMALLOC_ASSERT(!meta->is_unused()); + + snmalloc_check_client( + mitigations(sanity_checks), + is_start_of_object(entry.get_sizeclass(), address_cast(msg)), + "Not deallocating start of an object"); + + size_t objsize = sizeclass_full_to_size(entry.get_sizeclass()); + + auto [curr, length] = RemoteMessage::template open_free_ring( + msg, + objsize, + freelist::Object::key_root, + meta->as_key_tweak(), + domesticate); + + // Update the head and the next pointer in the free list. + meta->free_queue.append_segment( + curr, + msg.template as_reinterpret>(), + length, + freelist::Object::key_root, + meta->as_key_tweak(), + entropy); + + return meta->return_objects(length); + } + template SNMALLOC_SLOW_PATH capptr::Alloc small_alloc(smallsizeclass_t sizeclass, freelist::Iter<>& fast_free_list) @@ -871,11 +940,11 @@ namespace snmalloc if (destroy_queue) { - auto cb = [this](capptr::Alloc p) { + auto cb = [this, domesticate](capptr::Alloc m) { bool need_post = true; // Always going to post, so ignore. 
const PagemapEntry& entry = - Config::Backend::get_metaentry(snmalloc::address_cast(p)); - handle_dealloc_remote(entry, p.as_void(), need_post); + Config::Backend::get_metaentry(snmalloc::address_cast(m)); + handle_dealloc_remote(entry, m, need_post, domesticate); }; message_queue().destroy_and_iterate(domesticate, cb); diff --git a/src/snmalloc/mem/freelist.h b/src/snmalloc/mem/freelist.h index f491e979a..56b18f5a8 100644 --- a/src/snmalloc/mem/freelist.h +++ b/src/snmalloc/mem/freelist.h @@ -40,6 +40,8 @@ namespace snmalloc { + class BatchedRemoteMessage; + static constexpr address_t NO_KEY_TWEAK = 0; /** @@ -139,6 +141,8 @@ namespace snmalloc friend class Object; + friend class ::snmalloc::BatchedRemoteMessage; + class Empty { public: @@ -916,6 +920,34 @@ namespace snmalloc return {first, last}; } + /** + * Put back an extracted segment from a builder using the same key. + * + * The caller must tell us how many elements are involved. + */ + void append_segment( + Object::BHeadPtr first, + Object::BHeadPtr last, + uint16_t size, + const FreeListKey& key, + address_t key_tweak, + LocalEntropy& entropy) + { + uint32_t index; + if constexpr (RANDOM) + index = entropy.next_bit(); + else + index = 0; + + if constexpr (TRACK_LENGTH) + length[index] += size; + else + UNUSED(size); + + Object::store_next(cast_end(index), first, key, key_tweak); + set_end(index, &(last->next_object)); + } + template SNMALLOC_FAST_PATH void validate( const FreeListKey& key, address_t key_tweak, Domesticator domesticate) diff --git a/src/snmalloc/mem/localalloc.h b/src/snmalloc/mem/localalloc.h index cfa0a5db8..9fad26012 100644 --- a/src/snmalloc/mem/localalloc.h +++ b/src/snmalloc/mem/localalloc.h @@ -286,7 +286,7 @@ namespace snmalloc address_cast(entry.get_slab_metadata())); #endif local_cache.remote_dealloc_cache.template dealloc( - entry.get_remote()->trunc_id(), p); + entry.get_slab_metadata(), p, &local_cache.entropy); post_remote_cache(); return; } @@ -658,6 +658,12 @@ namespace snmalloc return; } + dealloc_remote(entry, p_tame); + } + + SNMALLOC_SLOW_PATH void + dealloc_remote(const PagemapEntry& entry, capptr::Alloc p_tame) + { RemoteAllocator* remote = entry.get_remote(); if (SNMALLOC_LIKELY(remote != nullptr)) { @@ -673,12 +679,12 @@ namespace snmalloc if (local_cache.remote_dealloc_cache.reserve_space(entry)) { local_cache.remote_dealloc_cache.template dealloc( - remote->trunc_id(), p_tame); + entry.get_slab_metadata(), p_tame, &local_cache.entropy); # ifdef SNMALLOC_TRACING message<1024>( "Remote dealloc fast {} ({}, {})", - p_raw, - alloc_size(p_raw), + address_cast(p_tame), + alloc_size(p_tame.unsafe_ptr()), address_cast(entry.get_slab_metadata())); # endif return; diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 7cf50e3af..a58822dfa 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -500,6 +500,59 @@ namespace snmalloc return (--needed()) == 0; } + class ReturnObjectsIterator + { + uint16_t _batch; + FrontendSlabMetadata* _meta; + + static_assert(sizeof(_batch) * 8 > MAX_CAPACITY_BITS); + + public: + ReturnObjectsIterator(uint16_t n, FrontendSlabMetadata* m) + : _batch(n), _meta(m) + {} + + template + SNMALLOC_FAST_PATH bool step() + { + // The first update must always return some positive number of objects. + SNMALLOC_ASSERT(!first || (_batch != 0)); + + /* + * Stop iteration when there are no more objects to return. Perform + * this test only on non-first steps to avoid a branch on the hot path. 
+ */ + if (!first && _batch == 0) + return false; + + if (SNMALLOC_LIKELY(_batch < _meta->needed())) + { + // Will not hit threshold for state transition + _meta->needed() -= _batch; + return false; + } + + // Hit threshold for state transition, may yet hit another + _batch -= _meta->needed(); + _meta->needed() = 0; + return true; + } + }; + + /** + * A batch version of return_object. + * + * Returns an iterator that should have `.step<>()` called on it repeatedly + * until it returns `false`. The first step should invoke `.step()` + * while the rest should invoke `.step()`. After each + * true-returning `.step()`, the caller should run the slow-path code to + * update the rest of the metadata for this slab. + */ + ReturnObjectsIterator return_objects(uint16_t n) + { + return ReturnObjectsIterator(n, this); + } + bool is_unused() { return needed() == 0; @@ -605,7 +658,13 @@ namespace snmalloc [[nodiscard]] SNMALLOC_FAST_PATH address_t as_key_tweak() const noexcept { - return address_cast(this) / alignof(decltype(*this)); + return as_key_tweak(address_cast(this)); + } + + [[nodiscard]] SNMALLOC_FAST_PATH static address_t + as_key_tweak(address_t self) + { + return self / alignof(FrontendSlabMetadata); } typename ClientMeta::DataRef get_meta_for_object(size_t index) diff --git a/src/snmalloc/mem/remoteallocator.h b/src/snmalloc/mem/remoteallocator.h index c0fb7240c..0a72aa318 100644 --- a/src/snmalloc/mem/remoteallocator.h +++ b/src/snmalloc/mem/remoteallocator.h @@ -1,10 +1,284 @@ #pragma once #include "freelist_queue.h" -#include "remotecache.h" namespace snmalloc { + class RemoteMessageAssertions; + + /** + * Entries on a remote message queue. Logically, this is a pair of freelist + * linkages, together with some metadata: + * + * - a cyclic list ("ring") of free objects (atypically for rings, there is + * no sentinel node here: the message itself is a free object), + * + * - the length of that ring + * + * - the linkage for the message queue itself + * + * In practice, there is a fair bit more going on here: the ring of free + * objects is not entirely encoded as a freelist. While traversing the + * successor pointers in objects on the ring will eventually lead back to + * this RemoteMessage object, the successor pointer from this object is + * encoded as a relative displacement. This is guaranteed to be physically + * smaller than a full pointer (because slabs are smaller than the whole + * address space). This gives us enough room to pack in the length of the + * ring, without needing to grow the structure. + */ + class BatchedRemoteMessage + { + friend class BatchedRemoteMessageAssertions; + + freelist::Object::T<> free_ring; + freelist::Object::T<> message_link; + + static_assert( + sizeof(free_ring.next_object) >= sizeof(void*), + "BatchedRemoteMessage bitpacking needs sizeof(void*) in next_object"); + + public: + static auto emplace_in_alloc(capptr::Alloc alloc) + { + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) BatchedRemoteMessage()); + } + + static auto mk_from_freelist_builder( + freelist::Builder& flb, + const FreeListKey& key, + address_t key_tweak) + { + size_t size = flb.extract_segment_length(); + + SNMALLOC_ASSERT(size < bits::one_at_bit(MAX_CAPACITY_BITS)); + + auto [first, last] = flb.extract_segment(key, key_tweak); + + /* + * Preserve the last node's backpointer and change its type. 
Because we + * use placement new to build our RemoteMessage atop the memory of a + * freelist::Object::T<> (to avoid UB) and the constructor may nullify + * the `prev` field, put it right back. Ideally the compiler is smart + * enough to see that this is a no-op. + */ + auto last_prev = last->prev; + auto self = + CapPtr::unsafe_from( + new (last.unsafe_ptr()) BatchedRemoteMessage()); + self->free_ring.prev = last_prev; + + // XXX On CHERI, we could do a fair bit better if we had a primitive for + // extracting and discarding the offset. That probably beats the dance + // done below, but it should work as it stands. + + auto n = freelist::HeadPtr::unsafe_from( + unsafe_from_uintptr>( + (static_cast(pointer_diff_signed(self, first)) + << MAX_CAPACITY_BITS) + + size)); + + // Close the ring, storing our bit-packed value in the next field. + freelist::Object::store_nextish( + &self->free_ring.next_object, first, key, key_tweak, n); + + return self; + } + + static freelist::HeadPtr + to_message_link(capptr::Alloc m) + { + return pointer_offset(m, offsetof(BatchedRemoteMessage, message_link)) + .as_reinterpret>(); + } + + static capptr::Alloc + from_message_link(freelist::HeadPtr chainPtr) + { + return pointer_offset_signed( + chainPtr, + -static_cast( + offsetof(BatchedRemoteMessage, message_link))) + .as_reinterpret(); + } + + template + SNMALLOC_FAST_PATH static std::pair + open_free_ring( + capptr::Alloc m, + size_t objsize, + const FreeListKey& key, + address_t key_tweak, + Domesticator_queue domesticate) + { + uintptr_t encoded = + m->free_ring.read_next(key, key_tweak, domesticate).unsafe_uintptr(); + + uint16_t decoded_size = + static_cast(encoded) & bits::mask_bits(MAX_CAPACITY_BITS); + static_assert(sizeof(decoded_size) * 8 > MAX_CAPACITY_BITS); + + /* + * Derive an out-of-bounds pointer to the next allocation, then use the + * authmap to reconstruct an in-bounds version, which we then immediately + * bound and rewild and then domesticate (how strange). + * + * XXX See above re: doing better on CHERI. + */ + auto next = domesticate( + capptr_rewild( + Config::Backend::capptr_rederive_alloc( + pointer_offset_signed( + m, static_cast(encoded) >> MAX_CAPACITY_BITS), + objsize)) + .template as_static>()); + + if constexpr (mitigations(freelist_backward_edge)) + { + next->check_prev( + signed_prev(address_cast(m), address_cast(next), key, key_tweak)); + } + else + { + UNUSED(key); + UNUSED(key_tweak); + } + + return {next.template as_static>(), decoded_size}; + } + + template + static uint16_t ring_size( + capptr::Alloc m, + const FreeListKey& key, + address_t key_tweak, + Domesticator_queue domesticate) + { + uintptr_t encoded = + m->free_ring.read_next(key, key_tweak, domesticate).unsafe_uintptr(); + + uint16_t decoded_size = + static_cast(encoded) & bits::mask_bits(MAX_CAPACITY_BITS); + static_assert(sizeof(decoded_size) * 8 > MAX_CAPACITY_BITS); + + if constexpr (mitigations(freelist_backward_edge)) + { + /* + * Like above, but we don't strictly need to rebound the pointer, + * since it's only used internally. Still, doesn't hurt to bound + * to the free list linkage. 
+ */ + auto next = domesticate( + capptr_rewild( + Config::Backend::capptr_rederive_alloc( + pointer_offset_signed( + m, static_cast(encoded) >> MAX_CAPACITY_BITS), + sizeof(freelist::Object::T<>))) + .template as_static>()); + + next->check_prev( + signed_prev(address_cast(m), address_cast(next), key, key_tweak)); + } + else + { + UNUSED(key); + UNUSED(key_tweak); + UNUSED(domesticate); + } + + return decoded_size; + } + }; + + class BatchedRemoteMessageAssertions + { + static_assert( + (DEALLOC_BATCH_RINGS == 0) || + (sizeof(BatchedRemoteMessage) <= MIN_ALLOC_SIZE)); + static_assert(offsetof(BatchedRemoteMessage, free_ring) == 0); + + static_assert( + (DEALLOC_BATCH_RINGS == 0) || + (MAX_SLAB_SPAN_BITS + MAX_CAPACITY_BITS < 8 * sizeof(void*)), + "Ring bit-stuffing trick can't reach far enough to enclose a slab"); + }; + + /** The type of a remote message when we are not batching messages onto + * rings. + * + * Relative to BatchRemoteMessage, this type is smaller, as it contains only + * a single linkage, to the next message. (And possibly a backref, if + * mitigations(freelist_backward_edge) is enabled.) + */ + class SingletonRemoteMessage + { + friend class SingletonRemoteMessageAssertions; + + freelist::Object::T<> message_link; + + public: + static auto emplace_in_alloc(capptr::Alloc alloc) + { + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) SingletonRemoteMessage()); + } + + static freelist::HeadPtr + to_message_link(capptr::Alloc m) + { + return pointer_offset(m, offsetof(SingletonRemoteMessage, message_link)) + .as_reinterpret>(); + } + + static capptr::Alloc + from_message_link(freelist::HeadPtr chainPtr) + { + return pointer_offset_signed( + chainPtr, + -static_cast( + offsetof(SingletonRemoteMessage, message_link))) + .as_reinterpret(); + } + + template + SNMALLOC_FAST_PATH static std::pair + open_free_ring( + capptr::Alloc m, + size_t, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return { + m.as_reinterpret>(), static_cast(1)}; + } + + template + static uint16_t ring_size( + capptr::Alloc, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return 1; + } + }; + + class SingletonRemoteMessageAssertions + { + static_assert(sizeof(SingletonRemoteMessage) <= MIN_ALLOC_SIZE); + static_assert( + sizeof(SingletonRemoteMessage) == sizeof(freelist::Object::T<>)); + static_assert(offsetof(SingletonRemoteMessage, message_link) == 0); + }; + + using RemoteMessage = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + BatchedRemoteMessage, + SingletonRemoteMessage>; + + static_assert(sizeof(RemoteMessage) <= MIN_ALLOC_SIZE); + /** * A RemoteAllocator is the message queue of freed objects. 
It builds on the * FreeListMPSCQ but encapsulates knowledge that the objects are actually @@ -45,8 +319,9 @@ namespace snmalloc template void destroy_and_iterate(Domesticator_queue domesticate, Cb cb) { - auto cbwrap = [cb](freelist::HeadPtr p) - SNMALLOC_FAST_PATH_LAMBDA { cb(p.as_void()); }; + auto cbwrap = [cb](freelist::HeadPtr p) SNMALLOC_FAST_PATH_LAMBDA { + cb(RemoteMessage::from_message_link(p)); + }; return list.destroy_and_iterate(domesticate, cbwrap); } @@ -67,11 +342,14 @@ namespace snmalloc */ template void enqueue( - freelist::HeadPtr first, - freelist::HeadPtr last, + capptr::Alloc first, + capptr::Alloc last, Domesticator_head domesticate_head) { - list.enqueue(first, last, domesticate_head); + list.enqueue( + RemoteMessage::to_message_link(first), + RemoteMessage::to_message_link(last), + domesticate_head); } /** @@ -91,7 +369,10 @@ namespace snmalloc Domesticator_queue domesticate_queue, Cb cb) { - list.dequeue(domesticate_head, domesticate_queue, cb); + auto cbwrap = [cb](freelist::HeadPtr p) SNMALLOC_FAST_PATH_LAMBDA { + return cb(RemoteMessage::from_message_link(p)); + }; + list.dequeue(domesticate_head, domesticate_queue, cbwrap); } alloc_id_t trunc_id() diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h index c3dbdcd95..585fb9146 100644 --- a/src/snmalloc/mem/remotecache.h +++ b/src/snmalloc/mem/remotecache.h @@ -12,6 +12,179 @@ namespace snmalloc { + + /** + * Same-destination message batching. + * + * In addition to batching message sends (see below), we can also batch + * collections of messages destined for the same slab. This class handles + * collecting sufficiently temporally local messages destined to the same + * slab, collecting them with freelist::Builder(s), and then converting + * them to RemoteMessage rings when appropriate. + * + * In order that this class not need to know about the mechanics of actually + * pushing RemoteMessage-s around, the methods involved in "closing" rings + * -- that is, in converting freelist::Builder(s) to RemoteMessages -- take + * a callable, of template type Forward, which is given the destination + * slab('s metadata address) and the to-be-sent RemoteMessage. 
+ */ + template + class RemoteDeallocCacheBatching + { + static_assert(RINGS > 0); + + std::array, RINGS> open_builder; + std::array open_meta = {0}; + + SNMALLOC_FAST_PATH size_t + ring_set(typename Config::PagemapEntry::SlabMetadata* meta) + { + // See https://github.com/skeeto/hash-prospector for choice of constant + return ((meta->as_key_tweak() * 0x7EFB352D) >> 16) & + bits::mask_bits(DEALLOC_BATCH_RING_SET_BITS); + } + + template + SNMALLOC_FAST_PATH void close_one_pending(Forward forward, size_t ix) + { + auto rmsg = BatchedRemoteMessage::mk_from_freelist_builder( + open_builder[ix], + freelist::Object::key_root, + Config::PagemapEntry::SlabMetadata::as_key_tweak(open_meta[ix])); + + auto& entry = Config::Backend::get_metaentry(address_cast(rmsg)); + + forward(entry.get_remote()->trunc_id(), rmsg); + + open_meta[ix] = 0; + } + + SNMALLOC_FAST_PATH void init_one_pending( + size_t ix, typename Config::PagemapEntry::SlabMetadata* meta) + { + open_builder[ix].init( + 0, + freelist::Object::key_root, + Config::PagemapEntry::SlabMetadata::as_key_tweak(open_meta[ix])); + open_meta[ix] = address_cast(meta); + } + + public: + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata* meta, + freelist::HeadPtr r, + LocalEntropy* entropy, + Forward forward) + { + size_t ix_set = ring_set(meta); + + for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) + { + size_t ix = ix_set + ix_way; + if (address_cast(meta) == open_meta[ix]) + { + open_builder[ix].add( + r, freelist::Object::key_root, meta->as_key_tweak()); + + if constexpr (mitigations(random_preserve)) + { + auto rand_limit = entropy->next_fresh_bits(MAX_CAPACITY_BITS); + if (open_builder[ix].extract_segment_length() >= rand_limit) + { + close_one_pending(forward, ix); + open_meta[ix] = 0; + } + } + else + { + UNUSED(entropy); + } + return; + } + } + + // No hit in cache, so find an available or victim line. 
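+      // Prefer an empty way within this set; otherwise evict the way with the
+      // most queued objects, since closing it flushes the largest batch.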
+ + size_t victim_ix = ix_set; + size_t victim_size = 0; + for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) + { + size_t ix = ix_set + ix_way; + if (open_meta[ix] == 0) + { + victim_ix = ix; + break; + } + + size_t szix = open_builder[ix].extract_segment_length(); + if (szix > victim_size) + { + victim_size = szix; + victim_ix = ix; + } + } + + if (open_meta[victim_ix] != 0) + { + close_one_pending(forward, victim_ix); + } + init_one_pending(victim_ix, meta); + + open_builder[victim_ix].add( + r, freelist::Object::key_root, meta->as_key_tweak()); + } + + template + SNMALLOC_FAST_PATH void close_all(Forward forward) + { + for (size_t ix = 0; ix < RINGS; ix++) + { + if (open_meta[ix] != 0) + { + close_one_pending(forward, ix); + open_meta[ix] = 0; + } + } + } + + void init() + { + open_meta = {0}; + } + }; + + template + struct RemoteDeallocCacheNoBatching + { + void init() {} + + template + void close_all(Forward) + {} + + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata*, + freelist::HeadPtr r, + LocalEntropy* entropy, + Forward forward) + { + UNUSED(entropy); + + auto& entry = Config::Backend::get_metaentry(address_cast(r)); + forward( + entry.get_remote()->trunc_id(), + SingletonRemoteMessage::emplace_in_alloc(r.as_void())); + } + }; + + template + using RemoteDeallocCacheBatchingImpl = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + RemoteDeallocCacheBatching, + RemoteDeallocCacheNoBatching>; + /** * Stores the remote deallocation to batch them before sending */ @@ -20,6 +193,8 @@ namespace snmalloc { std::array, REMOTE_SLOTS> list; + RemoteDeallocCacheBatchingImpl batching; + /** * The total amount of memory we are waiting for before we will dispatch * to other allocators. Zero can mean we have not initialised the allocator @@ -69,14 +244,34 @@ namespace snmalloc } template - SNMALLOC_FAST_PATH void - dealloc(RemoteAllocator::alloc_id_t target_id, capptr::Alloc p) + SNMALLOC_FAST_PATH void forward( + RemoteAllocator::alloc_id_t target_id, capptr::Alloc msg) + { + list[get_slot(target_id, 0)].add( + RemoteMessage::to_message_link(msg), + RemoteAllocator::key_global, + NO_KEY_TWEAK); + } + + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata* meta, + capptr::Alloc p, + LocalEntropy* entropy) { SNMALLOC_ASSERT(initialised); + auto r = freelist::Object::make(p); - list[get_slot(target_id, 0)].add( - r, RemoteAllocator::key_global, NO_KEY_TWEAK); + batching.dealloc( + meta, + r, + entropy, + [this]( + RemoteAllocator::alloc_id_t target_id, + capptr::Alloc msg) { + forward(target_id, msg); + }); } template @@ -94,6 +289,12 @@ namespace snmalloc return capptr_domesticate(local_state, p); }; + batching.close_all([this]( + RemoteAllocator::alloc_id_t target_id, + capptr::Alloc msg) { + forward(target_id, msg); + }); + while (true) { auto my_slot = get_slot(id, post_round); @@ -105,9 +306,11 @@ namespace snmalloc if (!list[i].empty()) { - auto [first, last] = list[i].extract_segment(key, NO_KEY_TWEAK); + auto [first_, last_] = list[i].extract_segment(key, NO_KEY_TWEAK); + auto first = RemoteMessage::from_message_link(first_); + auto last = RemoteMessage::from_message_link(last_); const auto& entry = - Config::Backend::get_metaentry(address_cast(first)); + Config::Backend::get_metaentry(address_cast(first_)); auto remote = entry.get_remote(); // If the allocator is not correctly aligned, then the bit that is // set implies this is used by the backend, and we should not be @@ -181,6 +384,8 @@ 
namespace snmalloc l.init(0, RemoteAllocator::key_global, NO_KEY_TWEAK); } capacity = REMOTE_CACHE; + + batching.init(); } }; } // namespace snmalloc
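
The return_objects() / ReturnObjectsIterator protocol added in metadata.h is easiest to see with a standalone model. Nothing below is snmalloc code: SlabModel, ReturnObjectsIteratorModel and kCapacity are invented stand-ins, and the slow path's state transition is modelled by simply restoring a fresh `needed` threshold, which is an assumption about what dealloc_local_object_slow / dealloc_local_object_meta leave behind.

// Illustrative model only (assumed names; not part of the patch).
#include <cassert>
#include <cstdint>

struct SlabModel
{
  uint16_t needed; // deallocations still required before the slab needs attention
};

struct ReturnObjectsIteratorModel
{
  uint16_t batch;
  SlabModel* meta;

  // Mirrors ReturnObjectsIterator::step<first>(): returns true each time the
  // batch drives `needed` down to zero, i.e. each time the caller must run the
  // slow path.
  template<bool first>
  bool step()
  {
    if (!first && batch == 0)
      return false;
    if (batch < meta->needed)
    {
      meta->needed -= batch; // threshold not reached
      return false;
    }
    batch -= meta->needed; // threshold reached; caller handles the transition
    meta->needed = 0;
    return true;
  }
};

int main()
{
  constexpr uint16_t kCapacity = 16; // hypothetical fresh threshold after a transition
  SlabModel slab{5};                 // five more frees until the slab needs attention
  ReturnObjectsIteratorModel it{8, &slab}; // one message returns eight objects
  int transitions = 0;
  if (it.step<true>())
  {
    slab.needed = kCapacity; // stand-in for dealloc_local_object_slow's effect
    ++transitions;
    while (it.step<false>())
    {
      slab.needed = kCapacity; // stand-in for dealloc_local_object_meta's effect
      ++transitions;
    }
  }
  assert(transitions == 1 && slab.needed == kCapacity - 3);
}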
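
The ring encoding that mk_from_freelist_builder builds and open_free_ring / ring_size decode can likewise be shown with plain integer arithmetic. This is an illustrative sketch, not the patch's API: kMaxCapacityBits is an assumed stand-in for MAX_CAPACITY_BITS, and pack/unpack are invented helpers. The decode relies on the right shift of a negative value being arithmetic, as the patch's own decode path does.

// Illustrative sketch only (assumed constant and helper names).
#include <cassert>
#include <cstdint>

constexpr unsigned kMaxCapacityBits = 11; // assumed width of the length field

// Pack the signed byte displacement (message -> first ring element) and the
// ring length into one pointer-sized word: displacement in the high bits,
// length in the low kMaxCapacityBits bits.
uintptr_t pack(intptr_t displacement, uint16_t length)
{
  return (static_cast<uintptr_t>(displacement) << kMaxCapacityBits) + length;
}

// Recover both fields; the length cannot carry into the displacement because
// it fits below kMaxCapacityBits.
void unpack(uintptr_t enc, intptr_t& displacement, uint16_t& length)
{
  length = static_cast<uint16_t>(enc & ((uintptr_t{1} << kMaxCapacityBits) - 1));
  displacement = static_cast<intptr_t>(enc) >> kMaxCapacityBits;
}

int main()
{
  intptr_t d;
  uint16_t n;
  unpack(pack(-192, 7), d, n); // first element 192 bytes below the message
  assert(d == -192 && n == 7);
  unpack(pack(4096, 300), d, n);
  assert(d == 4096 && n == 300);
}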
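
Finally, the placement logic in RemoteDeallocCacheBatching::dealloc reduces to the ring_set hash followed by a scan over DEALLOC_BATCH_RING_ASSOC ways. The sketch below mirrors the defaults added in allocconfig.h (associativity 2, three set bits) and the hash constant from the patch; the key-tweak values in main are arbitrary examples, and the k-prefixed constants are invented stand-ins.

// Illustrative sketch only (assumed constant names; example inputs).
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kAssoc = 2;                  // DEALLOC_BATCH_RING_ASSOC default
constexpr size_t kSetBits = 3;                // DEALLOC_BATCH_RING_SET_BITS default
constexpr size_t kRings = kAssoc << kSetBits; // DEALLOC_BATCH_RINGS == 16

// Hash a slab's key tweak (its metadata address divided by its alignment) to a
// set index; multiplicative constant from https://github.com/skeeto/hash-prospector.
size_t ring_set(uintptr_t key_tweak)
{
  return (static_cast<size_t>(key_tweak * 0x7EFB352D) >> 16) &
    ((size_t{1} << kSetBits) - 1);
}

int main()
{
  // Each key tweak selects a set; dealloc() then scans that set's kAssoc ways
  // (indices set .. set + kAssoc - 1) for a matching or reusable builder.
  const uintptr_t tweaks[] = {0x1234, 0x9abc};
  for (uintptr_t tweak : tweaks)
  {
    size_t set = ring_set(tweak);
    std::printf("tweak %#llx -> set %zu, ways %zu..%zu of %zu rings\n",
                static_cast<unsigned long long>(tweak), set, set,
                set + kAssoc - 1, kRings);
  }
}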