From fb776da909eeb68835294e351ecaf3ae270b6891 Mon Sep 17 00:00:00 2001
From: Nathaniel Wesley Filardo
Date: Mon, 23 Sep 2024 18:18:09 +0000
Subject: [PATCH] WIP: BatchIt (#677)

* Rename dealloc_local_object_slower to _meta

  Unlike its brethren, `dealloc_local_object` and `dealloc_local_object_slow`,
  the `dealloc_local_object_slower` method does not take a pointer to free
  space.  Make this slightly more apparent by renaming it and adding some
  commentary to both definition and call site.

* corealloc: get meta in dealloc_local_object

  Make both the _fast() and _slow() arms take the meta as an argument;
  _meta() already did.

* Introduce RemoteMessage structure

  Plumb its use around remoteallocator and remotecache.

* NFC: Plumb metadata to remotecache dealloc

* Initial steps in batched remote messages

  This prepares the recipient to process a batched message.

* Initial dealloc-side batching machinery

  Exercise the recipient machinery by having senders collect adjacent frees
  to the same slab into a batch.

* Match free batch keying to slab freelist keying

* freelist: add append_segment

* SlabMetadata: machinery for returning multiple objects

  This might involve multiple (I think at most two, at the moment)
  transitions in the slab lifecycle state machine.  Towards that end, return
  indicators to the caller of whether the slow path must be taken and how
  many objects of the original set have not yet been counted as returned.

* corealloc: operate ring-at-a-time on remote queues

* RemoteCache: associative cache of rings

* RemoteCache: N-set caching

* Initial CHERI support for free rings

* Matt's fix for slow-path codegen

* Try: remotecache: don't store allocator IDs

  We can, as Matt so kindly reminds me, go get them from the pagemap.  Since
  we need this value only when closing a ring, the read from over there is
  probably not very onerous.  (We could also get the slab pointer from an
  object in the ring, but we need that whenever inserting into the cache, so
  it's probably more sensible to store that locally?)

* Make BatchIt optional

  Move the ring set bits and associativity knobs to allocconfig and expose
  them via CMake.  If the associativity is zero, use non-batched
  implementations of the `RemoteMessage` and `RemoteDeallocCacheBatching`
  classes.

  By default, turn BatchIt on when we have enough room in the minimum
  allocation size to do it.  Exactly how much space is enough is a function
  of which mitigations we have enabled and whether or not we are compiling
  with C++20.

  This commit reverts the change to `MIN_ALLOC_SIZE` made in "Introduce
  RemoteMessage structure" now that we have multiple types, and sizes, of
  remote message to choose from.

* RemoteDeallocCacheBatching: store metas as addresses

  There's no need for a full pointer here; it'd just make the structure
  larger on CHERI.

* NFC: plumb entropy from LocalAlloc to BatchIt

* BatchIt random eviction

  In order not to thwart `mitigations(random_preserve)` too much, if it's on
  in combination with BatchIt, roll the dice every time we append to a batch
  to decide whether to stochastically evict that batch.  By increasing the
  number of batches, we give the recipient allocator increased opportunity
  to randomly stripe batches across the two `freelist::Builder` segments
  associated with each slab.
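As a rough standalone illustration of the same-destination batching described
above (toy types and names throughout -- ToyBatcher, Way, set_of, Flush are
hypothetical, not the snmalloc API; only the hash constant and the default
knob values mirror the patch), deallocations are gathered per destination slab
in a small set-associative cache and only flushed as a single message when a
way is evicted or the cache is closed:

  // Illustrative sketch only; not snmalloc code.
  #include <array>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  struct ToyBatcher
  {
    static constexpr size_t Assoc = 2;   // cf. SNMALLOC_DEALLOC_BATCH_RING_ASSOC
    static constexpr size_t SetBits = 3; // cf. SNMALLOC_DEALLOC_BATCH_RING_SET_BITS
    static constexpr size_t Sets = size_t(1) << SetBits;

    struct Way
    {
      uintptr_t slab = 0;        // which slab this batch is destined for
      std::vector<void*> batch;  // stand-in for a freelist ring under construction
    };

    std::array<Way, Sets * Assoc> ways{};

    static size_t set_of(uintptr_t slab)
    {
      // Multiplicative hash of the slab address picks a set (same constant as
      // the patch; see https://github.com/skeeto/hash-prospector).
      return ((slab * 0x7EFB352D) >> 16) & (Sets - 1);
    }

    // Record one deallocation; Flush is called with (slab, batch) whenever a
    // batch has to be closed and sent as a single message.
    template<typename Flush>
    void dealloc(uintptr_t slab, void* p, Flush flush)
    {
      size_t base = set_of(slab) * Assoc;

      // Hit: extend the batch already open for this slab.
      for (size_t i = base; i < base + Assoc; i++)
      {
        if (ways[i].slab == slab)
        {
          ways[i].batch.push_back(p);
          return;
        }
      }

      // Miss: prefer an empty way, otherwise evict the fullest one in the set.
      size_t victim = base;
      for (size_t i = base; i < base + Assoc; i++)
      {
        if (ways[i].slab == 0)
        {
          victim = i;
          break;
        }
        if (ways[i].batch.size() > ways[victim].batch.size())
          victim = i;
      }

      if (ways[victim].slab != 0)
        flush(ways[victim].slab, ways[victim].batch);

      ways[victim].slab = slab;
      ways[victim].batch.clear();
      ways[victim].batch.push_back(p);
    }

    // Close every open batch, e.g. before posting the remote cache.
    template<typename Flush>
    void close_all(Flush flush)
    {
      for (auto& w : ways)
      {
        if (w.slab != 0)
        {
          flush(w.slab, w.batch);
          w.slab = 0;
          w.batch.clear();
        }
      }
    }
  };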
--------- Co-authored-by: Nathaniel Wesley Filardo Co-authored-by: Matthew Parkinson --- CMakeLists.txt | 5 + src/snmalloc/backend/backend.h | 8 + src/snmalloc/ds/allocconfig.h | 39 ++++ src/snmalloc/mem/corealloc.h | 135 +++++++++---- src/snmalloc/mem/freelist.h | 32 ++++ src/snmalloc/mem/localalloc.h | 14 +- src/snmalloc/mem/metadata.h | 61 +++++- src/snmalloc/mem/remoteallocator.h | 295 ++++++++++++++++++++++++++++- src/snmalloc/mem/remotecache.h | 217 ++++++++++++++++++++- 9 files changed, 755 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79c97a35c..ec0a5e958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,9 @@ endif() set(SNMALLOC_MIN_ALLOC_SIZE "" CACHE STRING "Minimum allocation bytes (power of 2)") set(SNMALLOC_MIN_ALLOC_STEP_SIZE "" CACHE STRING "Minimum allocation step (power of 2)") +set(SNMALLOC_DEALLOC_BATCH_RING_ASSOC "" CACHE STRING "Associativity of deallocation batch cache; 0 to disable") +set(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS "" CACHE STRING "Logarithm of number of deallocation batch cache associativity sets") + if(MSVC AND SNMALLOC_STATIC_LIBRARY AND (SNMALLOC_STATIC_LIBRARY_PREFIX STREQUAL "")) message(FATAL_ERROR "Empty static library prefix not supported on MSVC") endif() @@ -251,6 +254,8 @@ if (SNMALLOC_NO_REALLOCARR) endif() add_as_define_value(SNMALLOC_MIN_ALLOC_SIZE) add_as_define_value(SNMALLOC_MIN_ALLOC_STEP_SIZE) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) target_compile_definitions(snmalloc INTERFACE $<$:MALLOC_USABLE_SIZE_QUALIFIER=const>) diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 883c471c6..ee170c38f 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -188,6 +188,14 @@ namespace snmalloc local_state.get_object_range()->dealloc_range(arena, size); } + SNMALLOC_FAST_PATH static capptr::Alloc + capptr_rederive_alloc(capptr::Alloc a, size_t objsize) + { + return capptr_to_user_address_control( + Aal::capptr_bound( + Authmap::amplify(a), objsize)); + } + template SNMALLOC_FAST_PATH static const PagemapEntry& get_metaentry(address_t p) { diff --git a/src/snmalloc/ds/allocconfig.h b/src/snmalloc/ds/allocconfig.h index 81a72303a..78ea9f41a 100644 --- a/src/snmalloc/ds/allocconfig.h +++ b/src/snmalloc/ds/allocconfig.h @@ -120,6 +120,45 @@ namespace snmalloc static constexpr size_t REMOTE_SLOTS = 1 << REMOTE_SLOT_BITS; static constexpr size_t REMOTE_MASK = REMOTE_SLOTS - 1; +#if defined(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) + static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + SNMALLOC_DEALLOC_BATCH_RING_ASSOC; +#else +# if defined(__has_cpp_attribute) +# if ( \ + __has_cpp_attribute(msvc::no_unique_address) && \ + (__cplusplus >= 201803L || _MSVC_LANG >= 201803L)) || \ + __has_cpp_attribute(no_unique_address) + // For C++20 or later, we do have [[no_unique_address]] and so can also do + // batching if we aren't turning on the backward-pointer mitigations + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = + mitigations(freelist_backward_edge) ? 4 : 2; +# else + // For C++17, we don't have [[no_unique_address]] and so we always end up + // needing all four pointers' worth of space (because BatchedRemoteMessage has + // two freelist::Object::T<> links within, each of which will have two fields + // and will be padded to two pointers). 
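  // As a rough illustration of the layout difference (hypothetical types, not
  // snmalloc's; sizes assume a typical LP64 ABI):
  //
  //   struct Empty {};
  //   struct Link17 { void* next; Empty prev; };
  //     // Empty still occupies storage, so the struct pads to 2 * sizeof(void*).
  //   struct Link20 { void* next; [[no_unique_address]] Empty prev; };
  //     // Empty may share storage, so the struct shrinks to sizeof(void*).
  //
  // Two such links per message therefore cost four pointer words here.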
+ static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif +# else + // If we don't even have the feature test macro, we're C++17 or earlier. + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif + + static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + (MIN_ALLOC_SIZE >= (DEALLOC_BATCH_MIN_ALLOC_WORDS * sizeof(void*))) ? 2 : 0; +#endif + +#if defined(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = + SNMALLOC_DEALLOC_BATCH_RING_SET_BITS; +#else + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = 3; +#endif + + static constexpr size_t DEALLOC_BATCH_RINGS = + DEALLOC_BATCH_RING_ASSOC * bits::one_at_bit(DEALLOC_BATCH_RING_SET_BITS); + static_assert( INTERMEDIATE_BITS < MIN_ALLOC_STEP_BITS, "INTERMEDIATE_BITS must be less than MIN_ALLOC_BITS"); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 99b0a9568..aca108a6d 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -380,9 +380,15 @@ namespace snmalloc } /** - * Very slow path for deallocating an object locally. + * Very slow path for object deallocation. + * + * The object has already been returned to the slab, so all that is left to + * do is update its metadata and, if that pushes us into having too many + * unused slabs in this size class, return some. + * + * Also while here, check the time. */ - SNMALLOC_SLOW_PATH void dealloc_local_object_slower( + SNMALLOC_SLOW_PATH void dealloc_local_object_meta( const PagemapEntry& entry, BackendSlabMetadata* meta) { smallsizeclass_t sizeclass = entry.get_sizeclass().as_small(); @@ -427,14 +433,17 @@ namespace snmalloc * This is either waking up a slab that was not actively being used * by this thread, or handling the final deallocation onto a slab, * so it can be reused by other threads. + * + * Live large objects look like slabs that need attention when they become + * free; that attention is also given here. */ - SNMALLOC_SLOW_PATH void - dealloc_local_object_slow(capptr::Alloc p, const PagemapEntry& entry) + SNMALLOC_SLOW_PATH void dealloc_local_object_slow( + capptr::Alloc p, + const PagemapEntry& entry, + BackendSlabMetadata* meta) { // TODO: Handle message queue on this path? - auto* meta = entry.get_slab_metadata(); - if (meta->is_large()) { // Handle large deallocation here. @@ -460,7 +469,8 @@ namespace snmalloc return; } - dealloc_local_object_slower(entry, meta); + // Not a large object; update slab metadata + dealloc_local_object_meta(entry, meta); } /** @@ -503,13 +513,11 @@ namespace snmalloc SNMALLOC_FAST_PATH_LAMBDA { return capptr_domesticate(local_state, p); }; - auto cb = [this, - &need_post](freelist::HeadPtr msg) SNMALLOC_FAST_PATH_LAMBDA { + auto cb = [this, domesticate, &need_post]( + capptr::Alloc msg) SNMALLOC_FAST_PATH_LAMBDA { auto& entry = Config::Backend::template get_metaentry(snmalloc::address_cast(msg)); - - handle_dealloc_remote(entry, msg.as_void(), need_post); - + handle_dealloc_remote(entry, msg, need_post, domesticate); return true; }; @@ -548,10 +556,12 @@ namespace snmalloc * * need_post will be set to true, if capacity is exceeded. 
*/ + template void handle_dealloc_remote( const PagemapEntry& entry, - CapPtr p, - bool& need_post) + capptr::Alloc msg, + bool& need_post, + Domesticator_queue domesticate) { // TODO this needs to not double count stats // TODO this needs to not double revoke if using MTE @@ -559,21 +569,43 @@ namespace snmalloc if (SNMALLOC_LIKELY(entry.get_remote() == public_state())) { - dealloc_local_object(p, entry); + auto meta = entry.get_slab_metadata(); + + auto unreturned = + dealloc_local_objects_fast(msg, entry, meta, entropy, domesticate); + + /* + * dealloc_local_objects_fast has updated the free list but not updated + * the slab metadata; it falls to us to do so. It is UNLIKELY that we + * will need to take further steps, but we might. + */ + if (SNMALLOC_UNLIKELY(unreturned.template step())) + { + dealloc_local_object_slow(msg.as_void(), entry, meta); + + while (SNMALLOC_UNLIKELY(unreturned.template step())) + { + dealloc_local_object_meta(entry, meta); + } + } + return; } - else + + auto nelem = RemoteMessage::template ring_size( + msg, + freelist::Object::key_root, + entry.get_slab_metadata()->as_key_tweak(), + domesticate); + if ( + !need_post && + !attached_cache->remote_dealloc_cache.reserve_space(entry, nelem)) { - if ( - !need_post && - !attached_cache->remote_dealloc_cache.reserve_space(entry)) - { - need_post = true; - } - attached_cache->remote_dealloc_cache - .template dealloc( - entry.get_remote()->trunc_id(), p.as_void()); + need_post = true; } + attached_cache->remote_dealloc_cache + .template forward( + entry.get_remote()->trunc_id(), msg); } /** @@ -698,10 +730,12 @@ namespace snmalloc CapPtr p, const typename Config::PagemapEntry& entry) { - if (SNMALLOC_LIKELY(dealloc_local_object_fast(entry, p, entropy))) + auto meta = entry.get_slab_metadata(); + + if (SNMALLOC_LIKELY(dealloc_local_object_fast(p, entry, meta, entropy))) return; - dealloc_local_object_slow(p, entry); + dealloc_local_object_slow(p, entry, meta); } SNMALLOC_FAST_PATH void @@ -714,12 +748,11 @@ namespace snmalloc } SNMALLOC_FAST_PATH static bool dealloc_local_object_fast( - const PagemapEntry& entry, CapPtr p, + const PagemapEntry& entry, + BackendSlabMetadata* meta, LocalEntropy& entropy) { - auto meta = entry.get_slab_metadata(); - SNMALLOC_ASSERT(!meta->is_unused()); snmalloc_check_client( @@ -736,6 +769,42 @@ namespace snmalloc return SNMALLOC_LIKELY(!meta->return_object()); } + template + SNMALLOC_FAST_PATH static auto dealloc_local_objects_fast( + capptr::Alloc msg, + const PagemapEntry& entry, + BackendSlabMetadata* meta, + LocalEntropy& entropy, + Domesticator domesticate) + { + SNMALLOC_ASSERT(!meta->is_unused()); + + snmalloc_check_client( + mitigations(sanity_checks), + is_start_of_object(entry.get_sizeclass(), address_cast(msg)), + "Not deallocating start of an object"); + + size_t objsize = sizeclass_full_to_size(entry.get_sizeclass()); + + auto [curr, length] = RemoteMessage::template open_free_ring( + msg, + objsize, + freelist::Object::key_root, + meta->as_key_tweak(), + domesticate); + + // Update the head and the next pointer in the free list. 
+ meta->free_queue.append_segment( + curr, + msg.template as_reinterpret>(), + length, + freelist::Object::key_root, + meta->as_key_tweak(), + entropy); + + return meta->return_objects(length); + } + template SNMALLOC_SLOW_PATH capptr::Alloc small_alloc(smallsizeclass_t sizeclass, freelist::Iter<>& fast_free_list) @@ -871,11 +940,11 @@ namespace snmalloc if (destroy_queue) { - auto cb = [this](capptr::Alloc p) { + auto cb = [this, domesticate](capptr::Alloc m) { bool need_post = true; // Always going to post, so ignore. const PagemapEntry& entry = - Config::Backend::get_metaentry(snmalloc::address_cast(p)); - handle_dealloc_remote(entry, p.as_void(), need_post); + Config::Backend::get_metaentry(snmalloc::address_cast(m)); + handle_dealloc_remote(entry, m, need_post, domesticate); }; message_queue().destroy_and_iterate(domesticate, cb); diff --git a/src/snmalloc/mem/freelist.h b/src/snmalloc/mem/freelist.h index f491e979a..56b18f5a8 100644 --- a/src/snmalloc/mem/freelist.h +++ b/src/snmalloc/mem/freelist.h @@ -40,6 +40,8 @@ namespace snmalloc { + class BatchedRemoteMessage; + static constexpr address_t NO_KEY_TWEAK = 0; /** @@ -139,6 +141,8 @@ namespace snmalloc friend class Object; + friend class ::snmalloc::BatchedRemoteMessage; + class Empty { public: @@ -916,6 +920,34 @@ namespace snmalloc return {first, last}; } + /** + * Put back an extracted segment from a builder using the same key. + * + * The caller must tell us how many elements are involved. + */ + void append_segment( + Object::BHeadPtr first, + Object::BHeadPtr last, + uint16_t size, + const FreeListKey& key, + address_t key_tweak, + LocalEntropy& entropy) + { + uint32_t index; + if constexpr (RANDOM) + index = entropy.next_bit(); + else + index = 0; + + if constexpr (TRACK_LENGTH) + length[index] += size; + else + UNUSED(size); + + Object::store_next(cast_end(index), first, key, key_tweak); + set_end(index, &(last->next_object)); + } + template SNMALLOC_FAST_PATH void validate( const FreeListKey& key, address_t key_tweak, Domesticator domesticate) diff --git a/src/snmalloc/mem/localalloc.h b/src/snmalloc/mem/localalloc.h index cfa0a5db8..9fad26012 100644 --- a/src/snmalloc/mem/localalloc.h +++ b/src/snmalloc/mem/localalloc.h @@ -286,7 +286,7 @@ namespace snmalloc address_cast(entry.get_slab_metadata())); #endif local_cache.remote_dealloc_cache.template dealloc( - entry.get_remote()->trunc_id(), p); + entry.get_slab_metadata(), p, &local_cache.entropy); post_remote_cache(); return; } @@ -658,6 +658,12 @@ namespace snmalloc return; } + dealloc_remote(entry, p_tame); + } + + SNMALLOC_SLOW_PATH void + dealloc_remote(const PagemapEntry& entry, capptr::Alloc p_tame) + { RemoteAllocator* remote = entry.get_remote(); if (SNMALLOC_LIKELY(remote != nullptr)) { @@ -673,12 +679,12 @@ namespace snmalloc if (local_cache.remote_dealloc_cache.reserve_space(entry)) { local_cache.remote_dealloc_cache.template dealloc( - remote->trunc_id(), p_tame); + entry.get_slab_metadata(), p_tame, &local_cache.entropy); # ifdef SNMALLOC_TRACING message<1024>( "Remote dealloc fast {} ({}, {})", - p_raw, - alloc_size(p_raw), + address_cast(p_tame), + alloc_size(p_tame.unsafe_ptr()), address_cast(entry.get_slab_metadata())); # endif return; diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 7cf50e3af..a58822dfa 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -500,6 +500,59 @@ namespace snmalloc return (--needed()) == 0; } + class ReturnObjectsIterator + { + uint16_t _batch; + FrontendSlabMetadata* _meta; 
+ + static_assert(sizeof(_batch) * 8 > MAX_CAPACITY_BITS); + + public: + ReturnObjectsIterator(uint16_t n, FrontendSlabMetadata* m) + : _batch(n), _meta(m) + {} + + template + SNMALLOC_FAST_PATH bool step() + { + // The first update must always return some positive number of objects. + SNMALLOC_ASSERT(!first || (_batch != 0)); + + /* + * Stop iteration when there are no more objects to return. Perform + * this test only on non-first steps to avoid a branch on the hot path. + */ + if (!first && _batch == 0) + return false; + + if (SNMALLOC_LIKELY(_batch < _meta->needed())) + { + // Will not hit threshold for state transition + _meta->needed() -= _batch; + return false; + } + + // Hit threshold for state transition, may yet hit another + _batch -= _meta->needed(); + _meta->needed() = 0; + return true; + } + }; + + /** + * A batch version of return_object. + * + * Returns an iterator that should have `.step<>()` called on it repeatedly + * until it returns `false`. The first step should invoke `.step()` + * while the rest should invoke `.step()`. After each + * true-returning `.step()`, the caller should run the slow-path code to + * update the rest of the metadata for this slab. + */ + ReturnObjectsIterator return_objects(uint16_t n) + { + return ReturnObjectsIterator(n, this); + } + bool is_unused() { return needed() == 0; @@ -605,7 +658,13 @@ namespace snmalloc [[nodiscard]] SNMALLOC_FAST_PATH address_t as_key_tweak() const noexcept { - return address_cast(this) / alignof(decltype(*this)); + return as_key_tweak(address_cast(this)); + } + + [[nodiscard]] SNMALLOC_FAST_PATH static address_t + as_key_tweak(address_t self) + { + return self / alignof(FrontendSlabMetadata); } typename ClientMeta::DataRef get_meta_for_object(size_t index) diff --git a/src/snmalloc/mem/remoteallocator.h b/src/snmalloc/mem/remoteallocator.h index c0fb7240c..0a72aa318 100644 --- a/src/snmalloc/mem/remoteallocator.h +++ b/src/snmalloc/mem/remoteallocator.h @@ -1,10 +1,284 @@ #pragma once #include "freelist_queue.h" -#include "remotecache.h" namespace snmalloc { + class RemoteMessageAssertions; + + /** + * Entries on a remote message queue. Logically, this is a pair of freelist + * linkages, together with some metadata: + * + * - a cyclic list ("ring") of free objects (atypically for rings, there is + * no sentinel node here: the message itself is a free object), + * + * - the length of that ring + * + * - the linkage for the message queue itself + * + * In practice, there is a fair bit more going on here: the ring of free + * objects is not entirely encoded as a freelist. While traversing the + * successor pointers in objects on the ring will eventually lead back to + * this RemoteMessage object, the successor pointer from this object is + * encoded as a relative displacement. This is guaranteed to be physically + * smaller than a full pointer (because slabs are smaller than the whole + * address space). This gives us enough room to pack in the length of the + * ring, without needing to grow the structure. 
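   * Illustratively (paraphrasing the code below; "self" is this message,
   * "first" is the first object on the ring, and mask_bits abbreviates
   * bits::mask_bits):
   *
   *   encoded      = (uintptr_t(first - self) << MAX_CAPACITY_BITS) + length
   *   length       = encoded & mask_bits(MAX_CAPACITY_BITS)
   *   displacement = ptrdiff_t(encoded) >> MAX_CAPACITY_BITS  // sign-extends
   *   first        = self + displacement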
+ */ + class BatchedRemoteMessage + { + friend class BatchedRemoteMessageAssertions; + + freelist::Object::T<> free_ring; + freelist::Object::T<> message_link; + + static_assert( + sizeof(free_ring.next_object) >= sizeof(void*), + "BatchedRemoteMessage bitpacking needs sizeof(void*) in next_object"); + + public: + static auto emplace_in_alloc(capptr::Alloc alloc) + { + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) BatchedRemoteMessage()); + } + + static auto mk_from_freelist_builder( + freelist::Builder& flb, + const FreeListKey& key, + address_t key_tweak) + { + size_t size = flb.extract_segment_length(); + + SNMALLOC_ASSERT(size < bits::one_at_bit(MAX_CAPACITY_BITS)); + + auto [first, last] = flb.extract_segment(key, key_tweak); + + /* + * Preserve the last node's backpointer and change its type. Because we + * use placement new to build our RemoteMessage atop the memory of a + * freelist::Object::T<> (to avoid UB) and the constructor may nullify + * the `prev` field, put it right back. Ideally the compiler is smart + * enough to see that this is a no-op. + */ + auto last_prev = last->prev; + auto self = + CapPtr::unsafe_from( + new (last.unsafe_ptr()) BatchedRemoteMessage()); + self->free_ring.prev = last_prev; + + // XXX On CHERI, we could do a fair bit better if we had a primitive for + // extracting and discarding the offset. That probably beats the dance + // done below, but it should work as it stands. + + auto n = freelist::HeadPtr::unsafe_from( + unsafe_from_uintptr>( + (static_cast(pointer_diff_signed(self, first)) + << MAX_CAPACITY_BITS) + + size)); + + // Close the ring, storing our bit-packed value in the next field. + freelist::Object::store_nextish( + &self->free_ring.next_object, first, key, key_tweak, n); + + return self; + } + + static freelist::HeadPtr + to_message_link(capptr::Alloc m) + { + return pointer_offset(m, offsetof(BatchedRemoteMessage, message_link)) + .as_reinterpret>(); + } + + static capptr::Alloc + from_message_link(freelist::HeadPtr chainPtr) + { + return pointer_offset_signed( + chainPtr, + -static_cast( + offsetof(BatchedRemoteMessage, message_link))) + .as_reinterpret(); + } + + template + SNMALLOC_FAST_PATH static std::pair + open_free_ring( + capptr::Alloc m, + size_t objsize, + const FreeListKey& key, + address_t key_tweak, + Domesticator_queue domesticate) + { + uintptr_t encoded = + m->free_ring.read_next(key, key_tweak, domesticate).unsafe_uintptr(); + + uint16_t decoded_size = + static_cast(encoded) & bits::mask_bits(MAX_CAPACITY_BITS); + static_assert(sizeof(decoded_size) * 8 > MAX_CAPACITY_BITS); + + /* + * Derive an out-of-bounds pointer to the next allocation, then use the + * authmap to reconstruct an in-bounds version, which we then immediately + * bound and rewild and then domesticate (how strange). + * + * XXX See above re: doing better on CHERI. 
+ */ + auto next = domesticate( + capptr_rewild( + Config::Backend::capptr_rederive_alloc( + pointer_offset_signed( + m, static_cast(encoded) >> MAX_CAPACITY_BITS), + objsize)) + .template as_static>()); + + if constexpr (mitigations(freelist_backward_edge)) + { + next->check_prev( + signed_prev(address_cast(m), address_cast(next), key, key_tweak)); + } + else + { + UNUSED(key); + UNUSED(key_tweak); + } + + return {next.template as_static>(), decoded_size}; + } + + template + static uint16_t ring_size( + capptr::Alloc m, + const FreeListKey& key, + address_t key_tweak, + Domesticator_queue domesticate) + { + uintptr_t encoded = + m->free_ring.read_next(key, key_tweak, domesticate).unsafe_uintptr(); + + uint16_t decoded_size = + static_cast(encoded) & bits::mask_bits(MAX_CAPACITY_BITS); + static_assert(sizeof(decoded_size) * 8 > MAX_CAPACITY_BITS); + + if constexpr (mitigations(freelist_backward_edge)) + { + /* + * Like above, but we don't strictly need to rebound the pointer, + * since it's only used internally. Still, doesn't hurt to bound + * to the free list linkage. + */ + auto next = domesticate( + capptr_rewild( + Config::Backend::capptr_rederive_alloc( + pointer_offset_signed( + m, static_cast(encoded) >> MAX_CAPACITY_BITS), + sizeof(freelist::Object::T<>))) + .template as_static>()); + + next->check_prev( + signed_prev(address_cast(m), address_cast(next), key, key_tweak)); + } + else + { + UNUSED(key); + UNUSED(key_tweak); + UNUSED(domesticate); + } + + return decoded_size; + } + }; + + class BatchedRemoteMessageAssertions + { + static_assert( + (DEALLOC_BATCH_RINGS == 0) || + (sizeof(BatchedRemoteMessage) <= MIN_ALLOC_SIZE)); + static_assert(offsetof(BatchedRemoteMessage, free_ring) == 0); + + static_assert( + (DEALLOC_BATCH_RINGS == 0) || + (MAX_SLAB_SPAN_BITS + MAX_CAPACITY_BITS < 8 * sizeof(void*)), + "Ring bit-stuffing trick can't reach far enough to enclose a slab"); + }; + + /** The type of a remote message when we are not batching messages onto + * rings. + * + * Relative to BatchRemoteMessage, this type is smaller, as it contains only + * a single linkage, to the next message. (And possibly a backref, if + * mitigations(freelist_backward_edge) is enabled.) 
+ */ + class SingletonRemoteMessage + { + friend class SingletonRemoteMessageAssertions; + + freelist::Object::T<> message_link; + + public: + static auto emplace_in_alloc(capptr::Alloc alloc) + { + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) SingletonRemoteMessage()); + } + + static freelist::HeadPtr + to_message_link(capptr::Alloc m) + { + return pointer_offset(m, offsetof(SingletonRemoteMessage, message_link)) + .as_reinterpret>(); + } + + static capptr::Alloc + from_message_link(freelist::HeadPtr chainPtr) + { + return pointer_offset_signed( + chainPtr, + -static_cast( + offsetof(SingletonRemoteMessage, message_link))) + .as_reinterpret(); + } + + template + SNMALLOC_FAST_PATH static std::pair + open_free_ring( + capptr::Alloc m, + size_t, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return { + m.as_reinterpret>(), static_cast(1)}; + } + + template + static uint16_t ring_size( + capptr::Alloc, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return 1; + } + }; + + class SingletonRemoteMessageAssertions + { + static_assert(sizeof(SingletonRemoteMessage) <= MIN_ALLOC_SIZE); + static_assert( + sizeof(SingletonRemoteMessage) == sizeof(freelist::Object::T<>)); + static_assert(offsetof(SingletonRemoteMessage, message_link) == 0); + }; + + using RemoteMessage = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + BatchedRemoteMessage, + SingletonRemoteMessage>; + + static_assert(sizeof(RemoteMessage) <= MIN_ALLOC_SIZE); + /** * A RemoteAllocator is the message queue of freed objects. It builds on the * FreeListMPSCQ but encapsulates knowledge that the objects are actually @@ -45,8 +319,9 @@ namespace snmalloc template void destroy_and_iterate(Domesticator_queue domesticate, Cb cb) { - auto cbwrap = [cb](freelist::HeadPtr p) - SNMALLOC_FAST_PATH_LAMBDA { cb(p.as_void()); }; + auto cbwrap = [cb](freelist::HeadPtr p) SNMALLOC_FAST_PATH_LAMBDA { + cb(RemoteMessage::from_message_link(p)); + }; return list.destroy_and_iterate(domesticate, cbwrap); } @@ -67,11 +342,14 @@ namespace snmalloc */ template void enqueue( - freelist::HeadPtr first, - freelist::HeadPtr last, + capptr::Alloc first, + capptr::Alloc last, Domesticator_head domesticate_head) { - list.enqueue(first, last, domesticate_head); + list.enqueue( + RemoteMessage::to_message_link(first), + RemoteMessage::to_message_link(last), + domesticate_head); } /** @@ -91,7 +369,10 @@ namespace snmalloc Domesticator_queue domesticate_queue, Cb cb) { - list.dequeue(domesticate_head, domesticate_queue, cb); + auto cbwrap = [cb](freelist::HeadPtr p) SNMALLOC_FAST_PATH_LAMBDA { + return cb(RemoteMessage::from_message_link(p)); + }; + list.dequeue(domesticate_head, domesticate_queue, cbwrap); } alloc_id_t trunc_id() diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h index c3dbdcd95..585fb9146 100644 --- a/src/snmalloc/mem/remotecache.h +++ b/src/snmalloc/mem/remotecache.h @@ -12,6 +12,179 @@ namespace snmalloc { + + /** + * Same-destination message batching. + * + * In addition to batching message sends (see below), we can also batch + * collections of messages destined for the same slab. This class handles + * collecting sufficiently temporally local messages destined to the same + * slab, collecting them with freelist::Builder(s), and then converting + * them to RemoteMessage rings when appropriate. 
+ * + * In order that this class not need to know about the mechanics of actually + * pushing RemoteMessage-s around, the methods involved in "closing" rings + * -- that is, in converting freelist::Builder(s) to RemoteMessages -- take + * a callable, of template type Forward, which is given the destination + * slab('s metadata address) and the to-be-sent RemoteMessage. + */ + template + class RemoteDeallocCacheBatching + { + static_assert(RINGS > 0); + + std::array, RINGS> open_builder; + std::array open_meta = {0}; + + SNMALLOC_FAST_PATH size_t + ring_set(typename Config::PagemapEntry::SlabMetadata* meta) + { + // See https://github.com/skeeto/hash-prospector for choice of constant + return ((meta->as_key_tweak() * 0x7EFB352D) >> 16) & + bits::mask_bits(DEALLOC_BATCH_RING_SET_BITS); + } + + template + SNMALLOC_FAST_PATH void close_one_pending(Forward forward, size_t ix) + { + auto rmsg = BatchedRemoteMessage::mk_from_freelist_builder( + open_builder[ix], + freelist::Object::key_root, + Config::PagemapEntry::SlabMetadata::as_key_tweak(open_meta[ix])); + + auto& entry = Config::Backend::get_metaentry(address_cast(rmsg)); + + forward(entry.get_remote()->trunc_id(), rmsg); + + open_meta[ix] = 0; + } + + SNMALLOC_FAST_PATH void init_one_pending( + size_t ix, typename Config::PagemapEntry::SlabMetadata* meta) + { + open_builder[ix].init( + 0, + freelist::Object::key_root, + Config::PagemapEntry::SlabMetadata::as_key_tweak(open_meta[ix])); + open_meta[ix] = address_cast(meta); + } + + public: + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata* meta, + freelist::HeadPtr r, + LocalEntropy* entropy, + Forward forward) + { + size_t ix_set = ring_set(meta); + + for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) + { + size_t ix = ix_set + ix_way; + if (address_cast(meta) == open_meta[ix]) + { + open_builder[ix].add( + r, freelist::Object::key_root, meta->as_key_tweak()); + + if constexpr (mitigations(random_preserve)) + { + auto rand_limit = entropy->next_fresh_bits(MAX_CAPACITY_BITS); + if (open_builder[ix].extract_segment_length() >= rand_limit) + { + close_one_pending(forward, ix); + open_meta[ix] = 0; + } + } + else + { + UNUSED(entropy); + } + return; + } + } + + // No hit in cache, so find an available or victim line. 
+ + size_t victim_ix = ix_set; + size_t victim_size = 0; + for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) + { + size_t ix = ix_set + ix_way; + if (open_meta[ix] == 0) + { + victim_ix = ix; + break; + } + + size_t szix = open_builder[ix].extract_segment_length(); + if (szix > victim_size) + { + victim_size = szix; + victim_ix = ix; + } + } + + if (open_meta[victim_ix] != 0) + { + close_one_pending(forward, victim_ix); + } + init_one_pending(victim_ix, meta); + + open_builder[victim_ix].add( + r, freelist::Object::key_root, meta->as_key_tweak()); + } + + template + SNMALLOC_FAST_PATH void close_all(Forward forward) + { + for (size_t ix = 0; ix < RINGS; ix++) + { + if (open_meta[ix] != 0) + { + close_one_pending(forward, ix); + open_meta[ix] = 0; + } + } + } + + void init() + { + open_meta = {0}; + } + }; + + template + struct RemoteDeallocCacheNoBatching + { + void init() {} + + template + void close_all(Forward) + {} + + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata*, + freelist::HeadPtr r, + LocalEntropy* entropy, + Forward forward) + { + UNUSED(entropy); + + auto& entry = Config::Backend::get_metaentry(address_cast(r)); + forward( + entry.get_remote()->trunc_id(), + SingletonRemoteMessage::emplace_in_alloc(r.as_void())); + } + }; + + template + using RemoteDeallocCacheBatchingImpl = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + RemoteDeallocCacheBatching, + RemoteDeallocCacheNoBatching>; + /** * Stores the remote deallocation to batch them before sending */ @@ -20,6 +193,8 @@ namespace snmalloc { std::array, REMOTE_SLOTS> list; + RemoteDeallocCacheBatchingImpl batching; + /** * The total amount of memory we are waiting for before we will dispatch * to other allocators. Zero can mean we have not initialised the allocator @@ -69,14 +244,34 @@ namespace snmalloc } template - SNMALLOC_FAST_PATH void - dealloc(RemoteAllocator::alloc_id_t target_id, capptr::Alloc p) + SNMALLOC_FAST_PATH void forward( + RemoteAllocator::alloc_id_t target_id, capptr::Alloc msg) + { + list[get_slot(target_id, 0)].add( + RemoteMessage::to_message_link(msg), + RemoteAllocator::key_global, + NO_KEY_TWEAK); + } + + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata* meta, + capptr::Alloc p, + LocalEntropy* entropy) { SNMALLOC_ASSERT(initialised); + auto r = freelist::Object::make(p); - list[get_slot(target_id, 0)].add( - r, RemoteAllocator::key_global, NO_KEY_TWEAK); + batching.dealloc( + meta, + r, + entropy, + [this]( + RemoteAllocator::alloc_id_t target_id, + capptr::Alloc msg) { + forward(target_id, msg); + }); } template @@ -94,6 +289,12 @@ namespace snmalloc return capptr_domesticate(local_state, p); }; + batching.close_all([this]( + RemoteAllocator::alloc_id_t target_id, + capptr::Alloc msg) { + forward(target_id, msg); + }); + while (true) { auto my_slot = get_slot(id, post_round); @@ -105,9 +306,11 @@ namespace snmalloc if (!list[i].empty()) { - auto [first, last] = list[i].extract_segment(key, NO_KEY_TWEAK); + auto [first_, last_] = list[i].extract_segment(key, NO_KEY_TWEAK); + auto first = RemoteMessage::from_message_link(first_); + auto last = RemoteMessage::from_message_link(last_); const auto& entry = - Config::Backend::get_metaentry(address_cast(first)); + Config::Backend::get_metaentry(address_cast(first_)); auto remote = entry.get_remote(); // If the allocator is not correctly aligned, then the bit that is // set implies this is used by the backend, and we should not be @@ -181,6 +384,8 @@ 
namespace snmalloc
         l.init(0, RemoteAllocator::key_global, NO_KEY_TWEAK);
       }
       capacity = REMOTE_CACHE;
+
+      batching.init();
     }
   };
 } // namespace snmalloc
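For completeness, the new cache knobs added above can be pinned at configure
time; an illustrative invocation (the values shown match the in-tree defaults
when BatchIt is enabled, and an associativity of 0 selects the non-batched
implementations; <path-to-snmalloc-source> is a placeholder):

  cmake -DSNMALLOC_DEALLOC_BATCH_RING_ASSOC=2 \
        -DSNMALLOC_DEALLOC_BATCH_RING_SET_BITS=3 \
        <path-to-snmalloc-source>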