From 8b95b9a916c2ba76fba334b18b7e4e51eac03a26 Mon Sep 17 00:00:00 2001
From: Nathaniel Filardo <105816689+nwf-msr@users.noreply.github.com>
Date: Thu, 12 Sep 2024 17:06:53 -0400
Subject: [PATCH] Bottom commits from BatchIt (#675)

* msvc: set __cplusplus to the actual value in use

* ds_core/bits: add mask_bits; convert one_at_bit-s

* remotecache: enable reserve_space multiple objects

* nits

* Small changes to tracing

  - Trace "Handling remote" once per batch, rather than per element
  - Remote queue events also log the associated metaslab; we'll use this
    to assess the efficacy of
    https://github.com/microsoft/snmalloc/issues/634

* freelist builder: allow forcibly tracking length

* Try forward declaring freelist::Builder to appease macos-14

* freelist: tweak intra-slab obfuscation keys by meta address

* NFC: freelist: allow `next` to be arbitrary value

* Switch to a central, tweaked key for all free lists

* allocconfig: introduce some properties of slabs

  We'll use these to pack values in message queues.

  - Maximum distance between two objects in a single slab
  - Maximum number of objects in a slab

* NFC: Templatize LocalCache on Config

* NFC: split dealloc_local_object_slow

  We'll use the _slower form when we're just stepping a slab through
  multiple rounds of state transition (to come), which can't involve the
  actual memory object in question.

* NFC: make freelist::Object::T-s by placement new

* NFC: CoreAlloc: split dealloc_local_object

  The pattern of `if (!fast()) { slow() }` occurs in a few places,
  including in contexts where we already know the entry and so don't need
  to look it up.
--- CMakeLists.txt | 4 + src/snmalloc/aal/aal.h | 2 +- src/snmalloc/backend/globalconfig.h | 3 +- .../backend_helpers/largebuddyrange.h | 4 +- src/snmalloc/ds/allocconfig.h | 24 ++- src/snmalloc/ds_core/bits.h | 26 ++- src/snmalloc/mem/corealloc.h | 192 ++++++++++-------- src/snmalloc/mem/entropy.h | 31 ++- src/snmalloc/mem/freelist.h | 87 ++++++-- src/snmalloc/mem/localalloc.h | 32 ++- src/snmalloc/mem/localcache.h | 17 +- src/snmalloc/mem/metadata.h | 16 +- src/snmalloc/mem/remotecache.h | 13 +- src/snmalloc/mem/sizeclasstable.h | 10 +- src/test/func/domestication/domestication.cc | 3 +- src/test/perf/contention/contention.cc | 2 +- 16 files changed, 289 insertions(+), 177 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d0cdcb5e..79c97a35c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,6 +188,10 @@ if(SNMALLOC_USE_CXX17) else() target_compile_features(snmalloc INTERFACE cxx_std_20) endif() +# https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus +if(MSVC) + target_compile_options(snmalloc INTERFACE "/Zc:__cplusplus") +endif() # Add header paths. target_include_directories(snmalloc diff --git a/src/snmalloc/aal/aal.h b/src/snmalloc/aal/aal.h index e1763995e..92baaf1f6 100644 --- a/src/snmalloc/aal/aal.h +++ b/src/snmalloc/aal/aal.h @@ -56,7 +56,7 @@ namespace snmalloc { /* * Provide a default specification of address_t as uintptr_t for Arch-es - * that support IntegerPointers. Those Arch-es without IntegerPoihnters + * that support IntegerPointers. Those Arch-es without IntegerPointers * must explicitly give their address_t. 
* * This somewhat obtuse way of spelling the defaulting is necessary so diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h index 3918315bd..5d171a9b8 100644 --- a/src/snmalloc/backend/globalconfig.h +++ b/src/snmalloc/backend/globalconfig.h @@ -110,7 +110,8 @@ namespace snmalloc LocalEntropy entropy; entropy.init(); // Initialise key for remote deallocation lists - RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key()); + entropy.make_free_list_key(RemoteAllocator::key_global); + entropy.make_free_list_key(freelist::Object::key_root); // Need to randomise pagemap location. If requested and not a // StrictProvenance architecture, randomize its table's location within diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h index d1446d725..e85e06f21 100644 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ b/src/snmalloc/backend_helpers/largebuddyrange.h @@ -354,7 +354,7 @@ namespace snmalloc SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); SNMALLOC_ASSERT(bits::is_pow2(size)); - if (size >= (bits::one_at_bit(MAX_SIZE_BITS) - 1)) + if (size >= bits::mask_bits(MAX_SIZE_BITS)) { if (ParentRange::Aligned) return parent.alloc_range(size); @@ -378,7 +378,7 @@ namespace snmalloc if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) { - if (size >= (bits::one_at_bit(MAX_SIZE_BITS) - 1)) + if (size >= bits::mask_bits(MAX_SIZE_BITS)) { parent_dealloc_range(base, size); return; diff --git a/src/snmalloc/ds/allocconfig.h b/src/snmalloc/ds/allocconfig.h index bcb19213e..81a72303a 100644 --- a/src/snmalloc/ds/allocconfig.h +++ b/src/snmalloc/ds/allocconfig.h @@ -93,6 +93,28 @@ namespace snmalloc MAX_SMALL_SIZECLASS_SIZE >= MIN_CHUNK_SIZE, "Large sizes need to be representable by as a multiple of MIN_CHUNK_SIZE"); + /** + * The number of bits needed to count the number of objects within a slab. 
+ * + * Most likely, this is achieved by the smallest sizeclass, which will have + * many more than MIN_OBJECT_COUNT objects in its slab. But, just in case, + * it's defined here and checked when we compute the sizeclass table, since + * computing this number is potentially nontrivial. + */ +#if defined(SNMALLOC_QEMU_WORKAROUND) && defined(SNMALLOC_VA_BITS_64) + static constexpr size_t MAX_CAPACITY_BITS = 13; +#else + static constexpr size_t MAX_CAPACITY_BITS = 11; +#endif + + /** + * The maximum distance between the start of two objects in the same slab. + */ + static constexpr size_t MAX_SLAB_SPAN_SIZE = + (MIN_OBJECT_COUNT - 1) * MAX_SMALL_SIZECLASS_SIZE; + static constexpr size_t MAX_SLAB_SPAN_BITS = + bits::next_pow2_bits_const(MAX_SLAB_SPAN_SIZE); + // Number of slots for remote deallocation. static constexpr size_t REMOTE_SLOT_BITS = 8; static constexpr size_t REMOTE_SLOTS = 1 << REMOTE_SLOT_BITS; @@ -117,7 +139,7 @@ namespace snmalloc #ifdef USE_REMOTE_CACHE USE_REMOTE_CACHE #else - 1 << MIN_CHUNK_BITS + MIN_CHUNK_SIZE #endif ; diff --git a/src/snmalloc/ds_core/bits.h b/src/snmalloc/ds_core/bits.h index 74f7b3a2b..b192c8275 100644 --- a/src/snmalloc/ds_core/bits.h +++ b/src/snmalloc/ds_core/bits.h @@ -45,11 +45,12 @@ namespace snmalloc static constexpr size_t BITS = sizeof(size_t) * CHAR_BIT; /** - * Returns a value of type T that has a single bit set, + * Returns a value of type T that has a single bit set at the given index, + * with 0 being the least significant bit. * - * S is a template parameter because callers use either `int` or `size_t` - * and either is valid to represent a number in the range 0-63 (or 0-127 if - * we want to use `__uint128_t` as `T`). + * S, the type of the bit index, is a template parameter because callers + * use either `int` or `size_t` and either is valid to represent a number in + * the range 0-63 (or 0-127 if we want to use `__uint128_t` as `T`). 
*/ template constexpr T one_at_bit(S shift) @@ -59,6 +60,19 @@ namespace snmalloc return (static_cast(1)) << shift; } + /** + * Returns a value of type T that has its n LSBs all set. + * + * S is a template parameter because callers use either `int` or `size_t` + * and either is valid to represent a number in the range 0-63 (or 0-127 if + * we want to use `__uint128_t` as `T`). + */ + template + constexpr T mask_bits(S n) + { + return one_at_bit(n) - 1; + } + inline SNMALLOC_FAST_PATH size_t clz(size_t x) { SNMALLOC_ASSERT(x != 0); // Calling with 0 is UB on some implementations @@ -326,7 +340,7 @@ namespace snmalloc constexpr size_t to_exp_mant_const(size_t value) { constexpr size_t LEADING_BIT = one_at_bit(MANTISSA_BITS + LOW_BITS) >> 1; - constexpr size_t MANTISSA_MASK = one_at_bit(MANTISSA_BITS) - 1; + constexpr size_t MANTISSA_MASK = mask_bits(MANTISSA_BITS); value = value - 1; @@ -344,7 +358,7 @@ namespace snmalloc if (MANTISSA_BITS > 0) { m_e = m_e + 1; - constexpr size_t MANTISSA_MASK = one_at_bit(MANTISSA_BITS) - 1; + constexpr size_t MANTISSA_MASK = mask_bits(MANTISSA_BITS); size_t m = m_e & MANTISSA_MASK; size_t e = m_e >> MANTISSA_BITS; size_t b = e == 0 ? 0 : 1; diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index ec254b6ea..99b0a9568 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -104,7 +104,7 @@ namespace snmalloc * This is the thread local structure associated to this * allocator. */ - LocalCache* attached_cache; + LocalCache* attached_cache; /** * Ticker to query the clock regularly at a lower cost. 
@@ -186,7 +186,7 @@ namespace snmalloc { auto slab_end = pointer_offset(bumpptr, slab_size + 1 - rsize); - auto& key = entropy.get_free_list_key(); + auto key_tweak = meta->as_key_tweak(); auto& b = meta->free_queue; @@ -237,14 +237,15 @@ namespace snmalloc auto curr_ptr = start_ptr; do { + auto next_ptr = curr_ptr->next; b.add( // Here begins our treatment of the heap as containing Wild pointers freelist::Object::make( capptr_to_user_address_control(curr_ptr.as_void())), - key, - NO_KEY_TWEAK, + freelist::Object::key_root, + key_tweak, entropy); - curr_ptr = curr_ptr->next; + curr_ptr = next_ptr; } while (curr_ptr != start_ptr); } else @@ -258,8 +259,8 @@ namespace snmalloc capptr_to_user_address_control( Aal::capptr_bound( p.as_void(), rsize))), - key, - NO_KEY_TWEAK, + freelist::Object::key_root, + key_tweak, entropy); p = pointer_offset(p, rsize); } while (p < slab_end); @@ -271,17 +272,18 @@ namespace snmalloc capptr::Alloc clear_slab(BackendSlabMetadata* meta, smallsizeclass_t sizeclass) { - auto& key = entropy.get_free_list_key(); + auto key_tweak = meta->as_key_tweak(); freelist::Iter<> fl; - auto more = meta->free_queue.close(fl, key, NO_KEY_TWEAK); + auto more = + meta->free_queue.close(fl, freelist::Object::key_root, key_tweak); UNUSED(more); auto local_state = backend_state_ptr(); auto domesticate = [local_state](freelist::QueuePtr p) SNMALLOC_FAST_PATH_LAMBDA { return capptr_domesticate(local_state, p); }; - capptr::Alloc p = - finish_alloc_no_zero(fl.take(key, domesticate), sizeclass); + capptr::Alloc p = finish_alloc_no_zero( + fl.take(freelist::Object::key_root, domesticate), sizeclass); // If clear_meta is requested, we should also walk the free list to clear // it. @@ -295,7 +297,7 @@ namespace snmalloc size_t count = 1; // Already taken one above. 
while (!fl.empty()) { - fl.take(key, domesticate); + fl.take(freelist::Object::key_root, domesticate); count++; } // Check the list contains all the elements @@ -305,13 +307,14 @@ namespace snmalloc if (more > 0) { - auto no_more = meta->free_queue.close(fl, key, NO_KEY_TWEAK); + auto no_more = + meta->free_queue.close(fl, freelist::Object::key_root, key_tweak); SNMALLOC_ASSERT(no_more == 0); UNUSED(no_more); while (!fl.empty()) { - fl.take(key, domesticate); + fl.take(freelist::Object::key_root, domesticate); count++; } } @@ -323,7 +326,7 @@ namespace snmalloc #ifdef SNMALLOC_TRACING message<1024>( - "Slab {} is unused, Object sizeclass {}", + "Slab {} is unused, Object sizeclass {}", start_of_slab.unsafe_ptr(), sizeclass); #endif @@ -351,7 +354,7 @@ namespace snmalloc if (check_slabs) { meta->free_queue.validate( - entropy.get_free_list_key(), NO_KEY_TWEAK, domesticate); + freelist::Object::key_root, meta->as_key_tweak(), domesticate); } return; } @@ -377,42 +380,13 @@ namespace snmalloc } /** - * Slow path for deallocating an object locally. - * This is either waking up a slab that was not actively being used - * by this thread, or handling the final deallocation onto a slab, - * so it can be reused by other threads. + * Very slow path for deallocating an object locally. */ - SNMALLOC_SLOW_PATH void - dealloc_local_object_slow(capptr::Alloc p, const PagemapEntry& entry) + SNMALLOC_SLOW_PATH void dealloc_local_object_slower( + const PagemapEntry& entry, BackendSlabMetadata* meta) { - // TODO: Handle message queue on this path? - - auto* meta = entry.get_slab_metadata(); - - if (meta->is_large()) - { - // Handle large deallocation here. - size_t entry_sizeclass = entry.get_sizeclass().as_large(); - size_t size = bits::one_at_bit(entry_sizeclass); - -#ifdef SNMALLOC_TRACING - message<1024>("Large deallocation: {}", size); -#else - UNUSED(size); -#endif - - // Remove from set of fully used slabs. 
- meta->node.remove(); - - Config::Backend::dealloc_chunk( - get_backend_local_state(), *meta, p, size, entry.get_sizeclass()); - - return; - } - smallsizeclass_t sizeclass = entry.get_sizeclass().as_small(); - UNUSED(entropy); if (meta->is_sleeping()) { // Slab has been woken up add this to the list of slabs with free space. @@ -448,6 +422,47 @@ namespace snmalloc ticker.check_tick(); } + /** + * Slow path for deallocating an object locally. + * This is either waking up a slab that was not actively being used + * by this thread, or handling the final deallocation onto a slab, + * so it can be reused by other threads. + */ + SNMALLOC_SLOW_PATH void + dealloc_local_object_slow(capptr::Alloc p, const PagemapEntry& entry) + { + // TODO: Handle message queue on this path? + + auto* meta = entry.get_slab_metadata(); + + if (meta->is_large()) + { + // Handle large deallocation here. + + // XXX: because large objects have unique metadata associated with them, + // the ring size here is one. We should probably assert that. + + size_t entry_sizeclass = entry.get_sizeclass().as_large(); + size_t size = bits::one_at_bit(entry_sizeclass); + +#ifdef SNMALLOC_TRACING + message<1024>("Large deallocation: {}", size); +#else + UNUSED(size); +#endif + + // Remove from set of fully used slabs. 
+ meta->node.remove(); + + Config::Backend::dealloc_chunk( + get_backend_local_state(), *meta, p, size, entry.get_sizeclass()); + + return; + } + + dealloc_local_object_slower(entry, meta); + } + /** * Check if this allocator has messages to deallocate blocks from another * thread @@ -490,10 +505,6 @@ namespace snmalloc }; auto cb = [this, &need_post](freelist::HeadPtr msg) SNMALLOC_FAST_PATH_LAMBDA { -#ifdef SNMALLOC_TRACING - message<1024>("Handling remote"); -#endif - auto& entry = Config::Backend::template get_metaentry(snmalloc::address_cast(msg)); @@ -502,6 +513,10 @@ namespace snmalloc return true; }; +#ifdef SNMALLOC_TRACING + message<1024>("Handling remote queue before proceeding..."); +#endif + if constexpr (Config::Options.QueueHeadsAreTame) { /* @@ -544,18 +559,17 @@ namespace snmalloc if (SNMALLOC_LIKELY(entry.get_remote() == public_state())) { - if (SNMALLOC_LIKELY( - dealloc_local_object_fast(entry, p.as_void(), entropy))) - return; - - dealloc_local_object_slow(p, entry); + dealloc_local_object(p, entry); + return; } else { if ( !need_post && !attached_cache->remote_dealloc_cache.reserve_space(entry)) + { need_post = true; + } attached_cache->remote_dealloc_cache .template dealloc( entry.get_remote()->trunc_id(), p.as_void()); @@ -631,7 +645,7 @@ namespace snmalloc typename = std::enable_if_t> CoreAllocator( Range& spare, - LocalCache* cache, + LocalCache* cache, LocalState* backend = nullptr) : backend_state(backend), attached_cache(cache) { @@ -661,7 +675,7 @@ namespace snmalloc // stats().remote_post(); // TODO queue not in line! 
bool sent_something = attached_cache->remote_dealloc_cache - .post( + .template post( backend_state_ptr(), public_state()->trunc_id()); return sent_something; @@ -680,19 +694,25 @@ namespace snmalloc return handle_message_queue_inner(action, args...); } - SNMALLOC_FAST_PATH void - dealloc_local_object(CapPtr p) + SNMALLOC_FAST_PATH void dealloc_local_object( + CapPtr p, + const typename Config::PagemapEntry& entry) { - // PagemapEntry-s seen here are expected to have meaningful Remote - // pointers - auto& entry = - Config::Backend::template get_metaentry(snmalloc::address_cast(p)); if (SNMALLOC_LIKELY(dealloc_local_object_fast(entry, p, entropy))) return; dealloc_local_object_slow(p, entry); } + SNMALLOC_FAST_PATH void + dealloc_local_object(CapPtr p) + { + // PagemapEntry-s seen here are expected to have meaningful Remote + // pointers + dealloc_local_object( + p, Config::Backend::template get_metaentry(snmalloc::address_cast(p))); + } + SNMALLOC_FAST_PATH static bool dealloc_local_object_fast( const PagemapEntry& entry, CapPtr p, @@ -709,10 +729,9 @@ namespace snmalloc auto cp = p.as_static>(); - auto& key = entropy.get_free_list_key(); - // Update the head and the next pointer in the free list. - meta->free_queue.add(cp, key, NO_KEY_TWEAK, entropy); + meta->free_queue.add( + cp, freelist::Object::key_root, meta->as_key_tweak(), entropy); return SNMALLOC_LIKELY(!meta->return_object()); } @@ -810,7 +829,7 @@ namespace snmalloc // Set meta slab to empty. 
meta->initialise( - sizeclass, address_cast(slab), entropy.get_free_list_key()); + sizeclass, address_cast(slab), freelist::Object::key_root); // Build a free list for the slab alloc_new_list(slab, meta, rsize, slab_size, entropy); @@ -869,7 +888,7 @@ namespace snmalloc handle_message_queue([]() {}); } - auto posted = attached_cache->flush( + auto posted = attached_cache->template flush( backend_state_ptr(), [&](capptr::Alloc p) { dealloc_local_object(p); }); @@ -880,21 +899,21 @@ namespace snmalloc dealloc_local_slabs(sizeclass); } - laden.iterate([this, domesticate]( - BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA { - if (!meta->is_large()) - { - meta->free_queue.validate( - entropy.get_free_list_key(), NO_KEY_TWEAK, domesticate); - } - }); + laden.iterate( + [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA { + if (!meta->is_large()) + { + meta->free_queue.validate( + freelist::Object::key_root, meta->as_key_tweak(), domesticate); + } + }); return posted; } // This allows the caching layer to be attached to an underlying // allocator instance. 
- void attach(LocalCache* c) + void attach(LocalCache* c) { #ifdef SNMALLOC_TRACING message<1024>("Attach cache to {}", this); @@ -917,10 +936,9 @@ namespace snmalloc */ bool debug_is_empty_impl(bool* result) { - auto& key = entropy.get_free_list_key(); - - auto error = [&result, &key](auto slab_metadata) { - auto slab_interior = slab_metadata->get_slab_interior(key); + auto error = [&result](auto slab_metadata) { + auto slab_interior = + slab_metadata->get_slab_interior(freelist::Object::key_root); const PagemapEntry& entry = Config::Backend::get_metaentry(slab_interior); SNMALLOC_ASSERT(slab_metadata == entry.get_slab_metadata()); @@ -933,9 +951,11 @@ namespace snmalloc else report_fatal_error( "debug_is_empty: found non-empty allocator: size={} on " - "slab_start {}", + "slab_start {} meta {} entry {}", sizeclass_full_to_size(size_class), - slab_start); + slab_start, + address_cast(slab_metadata), + address_cast(&entry)); }; auto test = [&error](auto& queue) { @@ -987,7 +1007,7 @@ namespace snmalloc { // We need a cache to perform some operations, so set one up // temporarily - LocalCache temp(public_state()); + LocalCache temp(public_state()); attach(&temp); #ifdef SNMALLOC_TRACING message<1024>("debug_is_empty - attach a cache"); diff --git a/src/snmalloc/mem/entropy.h b/src/snmalloc/mem/entropy.h index 2e63b68bf..c6f2c85ff 100644 --- a/src/snmalloc/mem/entropy.h +++ b/src/snmalloc/mem/entropy.h @@ -28,7 +28,6 @@ namespace snmalloc uint64_t local_counter{0}; uint64_t fresh_bits{0}; uint64_t count{0}; - FreeListKey key{0, 0, 0}; public: constexpr LocalEntropy() = default; @@ -38,18 +37,6 @@ namespace snmalloc { local_key = get_entropy64(); local_counter = get_entropy64(); - if constexpr (bits::BITS == 64) - { - key.key1 = get_next(); - key.key2 = get_next(); - key.key_next = get_next(); - } - else - { - key.key1 = get_next() & 0xffff'ffff; - key.key2 = get_next() & 0xffff'ffff; - key.key_next = get_next() & 0xffff'ffff; - } bit_source = get_next(); } @@ -70,9 
+57,20 @@ namespace snmalloc /** * A key for the free lists for this thread. */ - const FreeListKey& get_free_list_key() + void make_free_list_key(FreeListKey& key) { - return key; + if constexpr (bits::BITS == 64) + { + key.key1 = static_cast(get_next()); + key.key2 = static_cast(get_next()); + key.key_next = static_cast(get_next()); + } + else + { + key.key1 = static_cast(get_next() & 0xffff'ffff); + key.key2 = static_cast(get_next() & 0xffff'ffff); + key.key_next = static_cast(get_next() & 0xffff'ffff); + } } /** @@ -116,8 +114,7 @@ namespace snmalloc fresh_bits = get_next(); count = 64; } - uint16_t result = - static_cast(fresh_bits & (bits::one_at_bit(n) - 1)); + uint16_t result = static_cast(fresh_bits & bits::mask_bits(n)); fresh_bits >>= n; count -= n; return result; diff --git a/src/snmalloc/mem/freelist.h b/src/snmalloc/mem/freelist.h index 508a22839..f491e979a 100644 --- a/src/snmalloc/mem/freelist.h +++ b/src/snmalloc/mem/freelist.h @@ -55,9 +55,23 @@ namespace snmalloc namespace freelist { + template< + bool RANDOM, + bool TRACK_LENGTH = RANDOM, + SNMALLOC_CONCEPT(capptr::IsBound) BView = capptr::bounds::Alloc, + SNMALLOC_CONCEPT(capptr::IsBound) BQueue = capptr::bounds::AllocWild> + class Builder; + class Object { public: + /** + * Shared key for slab free lists (but tweaked by metadata address). 
+ * + * XXX Maybe this belongs somewhere else + */ + inline static FreeListKey key_root{0xdeadbeef, 0xbeefdead, 0xdeadbeef}; + template< SNMALLOC_CONCEPT(capptr::IsBound) BQueue = capptr::bounds::AllocWild> class T; @@ -117,6 +131,7 @@ namespace snmalloc class T { template< + bool, bool, SNMALLOC_CONCEPT(capptr::IsBound), SNMALLOC_CONCEPT(capptr::IsBound)> @@ -245,7 +260,8 @@ namespace snmalloc SNMALLOC_CONCEPT(capptr::IsBound) BView> static BHeadPtr make(CapPtr p) { - return p.template as_static>(); + return CapPtr, BView>::unsafe_from( + new (p.unsafe_ptr()) Object::T()); } /** @@ -304,7 +320,7 @@ namespace snmalloc * though the result is likely not safe to dereference, being an * obfuscated bundle of bits (on non-CHERI architectures, anyway). That's * additional motivation to consider the result BQueue-bounded, as that - * is likely (but not necessarily) Wild. + * is likely (but not necessarily) Wild. */ template< SNMALLOC_CONCEPT(capptr::IsBound) BView, @@ -364,23 +380,15 @@ namespace snmalloc "Free Object Queue bounds must match View bounds (but may be Wild)"); } - /** - * Assign next_object and update its prev_encoded if - * SNMALLOC_CHECK_CLIENT. Static so that it can be used on reference to a - * free Object. - * - * Returns a pointer to the next_object field of the next parameter as an - * optimization for repeated snoc operations (in which - * next->next_object is nullptr). 
- */ template< SNMALLOC_CONCEPT(capptr::IsBound) BView, SNMALLOC_CONCEPT(capptr::IsBound) BQueue> - static BQueuePtr* store_next( + static void store_nextish( BQueuePtr* curr, BHeadPtr next, const FreeListKey& key, - address_t key_tweak) + address_t key_tweak, + BHeadPtr next_value) { assert_view_queue_bounds(); @@ -391,11 +399,33 @@ namespace snmalloc } else { + UNUSED(next); UNUSED(key); UNUSED(key_tweak); } - *curr = encode_next(address_cast(curr), next, key, key_tweak); + *curr = encode_next(address_cast(curr), next_value, key, key_tweak); + } + + /** + * Assign next_object and update its prev_encoded if + * SNMALLOC_CHECK_CLIENT. Static so that it can be used on reference to a + * free Object. + * + * Returns a pointer to the next_object field of the next parameter as an + * optimization for repeated snoc operations (in which + * next->next_object is nullptr). + */ + template< + SNMALLOC_CONCEPT(capptr::IsBound) BView, + SNMALLOC_CONCEPT(capptr::IsBound) BQueue> + static BQueuePtr* store_next( + BQueuePtr* curr, + BHeadPtr next, + const FreeListKey& key, + address_t key_tweak) + { + store_nextish(curr, next, key, key_tweak, next); return &(next->next_object); } @@ -640,10 +670,13 @@ namespace snmalloc */ template< bool RANDOM, - SNMALLOC_CONCEPT(capptr::IsBound) BView = capptr::bounds::Alloc, - SNMALLOC_CONCEPT(capptr::IsBound) BQueue = capptr::bounds::AllocWild> + bool TRACK_LENGTH, + SNMALLOC_CONCEPT(capptr::IsBound) BView, + SNMALLOC_CONCEPT(capptr::IsBound) BQueue> class Builder { + static_assert(!RANDOM || TRACK_LENGTH); + static constexpr size_t LENGTH = RANDOM ? 
2 : 1; /* @@ -681,7 +714,8 @@ namespace snmalloc static_cast*>(head[ix])); } - SNMALLOC_NO_UNIQUE_ADDRESS std::array length{}; + SNMALLOC_NO_UNIQUE_ADDRESS + std::array length{}; public: constexpr Builder() = default; @@ -717,7 +751,7 @@ namespace snmalloc index = 0; set_end(index, Object::store_next(cast_end(index), n, key, key_tweak)); - if constexpr (RANDOM) + if constexpr (TRACK_LENGTH) { length[index]++; } @@ -739,6 +773,10 @@ namespace snmalloc { static_assert(RANDOM_ == RANDOM, "Don't set template parameter"); set_end(0, Object::store_next(cast_end(0), n, key, key_tweak)); + if constexpr (TRACK_LENGTH) + { + length[0]++; + } } /** @@ -831,7 +869,7 @@ namespace snmalloc for (size_t i = 0; i < LENGTH; i++) { end[i] = &head[i]; - if constexpr (RANDOM) + if constexpr (TRACK_LENGTH) { length[i] = 0; } @@ -849,6 +887,13 @@ namespace snmalloc } } + template + std::enable_if_t extract_segment_length() + { + static_assert(RANDOM_ == RANDOM, "Don't set SFINAE parameter!"); + return length[0]; + } + template std::enable_if_t< !RANDOM_, @@ -881,7 +926,7 @@ namespace snmalloc { if (&head[i] == end[i]) { - SNMALLOC_CHECK(!RANDOM || (length[i] == 0)); + SNMALLOC_CHECK(!TRACK_LENGTH || (length[i] == 0)); continue; } @@ -899,7 +944,7 @@ namespace snmalloc address_cast(curr), address_cast(next), key, key_tweak); curr = next; } - SNMALLOC_CHECK(!RANDOM || (count == length[i])); + SNMALLOC_CHECK(!TRACK_LENGTH || (count == length[i])); } } else diff --git a/src/snmalloc/mem/localalloc.h b/src/snmalloc/mem/localalloc.h index e0096d80d..cfa0a5db8 100644 --- a/src/snmalloc/mem/localalloc.h +++ b/src/snmalloc/mem/localalloc.h @@ -78,7 +78,7 @@ namespace snmalloc // allocation on the fast path. This part of the code is inspired by // mimalloc. // Also contains remote deallocation cache. - LocalCache local_cache{&Config::unused_remote}; + LocalCache local_cache{&Config::unused_remote}; // Underlying allocator for most non-fast path operations. 
CoreAlloc* core_alloc{nullptr}; @@ -209,7 +209,7 @@ namespace snmalloc if (meta != nullptr) { meta->initialise_large( - address_cast(chunk), local_cache.entropy.get_free_list_key()); + address_cast(chunk), freelist::Object::key_root); core_alloc->laden.insert(meta); } @@ -253,8 +253,7 @@ namespace snmalloc sizeclass); }; - return local_cache.template alloc( - domesticate, size, slowpath); + return local_cache.template alloc(domesticate, size, slowpath); } /** @@ -274,18 +273,18 @@ namespace snmalloc * In the second case we need to recheck if this is a remote deallocation, * as we might acquire the originating allocator. */ - SNMALLOC_SLOW_PATH void dealloc_remote_slow(capptr::Alloc p) + SNMALLOC_SLOW_PATH void + dealloc_remote_slow(const PagemapEntry& entry, capptr::Alloc p) { if (core_alloc != nullptr) { #ifdef SNMALLOC_TRACING message<1024>( - "Remote dealloc post {} ({})", + "Remote dealloc post {} ({}, {})", p.unsafe_ptr(), - alloc_size(p.unsafe_ptr())); + alloc_size(p.unsafe_ptr()), + address_cast(entry.get_slab_metadata())); #endif - const PagemapEntry& entry = - Config::Backend::template get_metaentry(address_cast(p)); local_cache.remote_dealloc_cache.template dealloc( entry.get_remote()->trunc_id(), p); post_remote_cache(); @@ -655,11 +654,7 @@ namespace snmalloc if (SNMALLOC_LIKELY(local_cache.remote_allocator == entry.get_remote())) { dealloc_cheri_checks(p_tame.unsafe_ptr()); - - if (SNMALLOC_LIKELY(CoreAlloc::dealloc_local_object_fast( - entry, p_tame, local_cache.entropy))) - return; - core_alloc->dealloc_local_object_slow(p_tame, entry); + core_alloc->dealloc_local_object(p_tame, entry); return; } @@ -681,12 +676,15 @@ namespace snmalloc remote->trunc_id(), p_tame); # ifdef SNMALLOC_TRACING message<1024>( - "Remote dealloc fast {} ({})", p_raw, alloc_size(p_raw)); + "Remote dealloc fast {} ({}, {})", + p_raw, + alloc_size(p_raw), + address_cast(entry.get_slab_metadata())); # endif return; } - dealloc_remote_slow(p_tame); + dealloc_remote_slow(entry, 
p_tame); return; } @@ -921,7 +919,7 @@ namespace snmalloc * core allocator for use by this local allocator then it needs to access * this field. */ - LocalCache& get_local_cache() + LocalCache& get_local_cache() { return local_cache; } diff --git a/src/snmalloc/mem/localcache.h b/src/snmalloc/mem/localcache.h index cfbbaa576..5a63e281d 100644 --- a/src/snmalloc/mem/localcache.h +++ b/src/snmalloc/mem/localcache.h @@ -37,6 +37,7 @@ namespace snmalloc // This is defined on its own, so that it can be embedded in the // thread local fast allocator, but also referenced from the // thread local core allocator. + template struct LocalCache { // Free list per small size class. These are used for @@ -54,7 +55,7 @@ namespace snmalloc /** * Remote deallocations for other threads */ - RemoteDeallocCache remote_dealloc_cache; + RemoteDeallocCache remote_dealloc_cache; constexpr LocalCache(RemoteAllocator* remote_allocator) : remote_allocator(remote_allocator) @@ -63,10 +64,10 @@ namespace snmalloc /** * Return all the free lists to the allocator. Used during thread teardown. 
*/ - template + template bool flush(typename Config::LocalState* local_state, DeallocFun dealloc) { - auto& key = entropy.get_free_list_key(); + auto& key = freelist::Object::key_root; auto domesticate = [local_state](freelist::QueuePtr p) SNMALLOC_FAST_PATH_LAMBDA { return capptr_domesticate(local_state, p); @@ -85,19 +86,15 @@ namespace snmalloc } } - return remote_dealloc_cache.post( + return remote_dealloc_cache.template post( local_state, remote_allocator->trunc_id()); } - template< - ZeroMem zero_mem, - typename Config, - typename Slowpath, - typename Domesticator> + template SNMALLOC_FAST_PATH capptr::Alloc alloc(Domesticator domesticate, size_t size, Slowpath slowpath) { - auto& key = entropy.get_free_list_key(); + auto& key = freelist::Object::key_root; smallsizeclass_t sizeclass = size_to_sizeclass(size); auto& fl = small_fast_free_lists[sizeclass]; if (SNMALLOC_LIKELY(!fl.empty())) diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index d696a4de6..7cf50e3af 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -455,7 +455,7 @@ namespace snmalloc static_assert( std::is_base_of::value, "Template should be a subclass of FrontendSlabMetadata"); - free_queue.init(slab, key, NO_KEY_TWEAK); + free_queue.init(slab, key, this->as_key_tweak()); // Set up meta data as if the entire slab has been turned into a free // list. This means we don't have to check for special cases where we have // returned all the elements, but this is a slab that is still being bump @@ -477,7 +477,7 @@ namespace snmalloc void initialise_large(address_t slab, const FreeListKey& key) { // We will push to this just to make the fast path clean. 
- free_queue.init(slab, key, NO_KEY_TWEAK); + free_queue.init(slab, key, this->as_key_tweak()); // Flag to detect that it is a large alloc on the slow path large_ = true; @@ -573,11 +573,12 @@ namespace snmalloc LocalEntropy& entropy, smallsizeclass_t sizeclass) { - auto& key = entropy.get_free_list_key(); + auto& key = freelist::Object::key_root; std::remove_reference_t tmp_fl; - auto remaining = meta->free_queue.close(tmp_fl, key, NO_KEY_TWEAK); + auto remaining = + meta->free_queue.close(tmp_fl, key, meta->as_key_tweak()); auto p = tmp_fl.take(key, domesticate); fast_free_list = tmp_fl; @@ -599,7 +600,12 @@ namespace snmalloc // start of the slab. [[nodiscard]] address_t get_slab_interior(const FreeListKey& key) const { - return address_cast(free_queue.read_head(0, key, NO_KEY_TWEAK)); + return address_cast(free_queue.read_head(0, key, this->as_key_tweak())); + } + + [[nodiscard]] SNMALLOC_FAST_PATH address_t as_key_tweak() const noexcept + { + return address_cast(this) / alignof(decltype(*this)); } typename ClientMeta::DataRef get_meta_for_object(size_t index) diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h index faea14103..c3dbdcd95 100644 --- a/src/snmalloc/mem/remotecache.h +++ b/src/snmalloc/mem/remotecache.h @@ -15,6 +15,7 @@ namespace snmalloc /** * Stores the remote deallocation to batch them before sending */ + template struct RemoteDeallocCache { std::array, REMOTE_SLOTS> list; @@ -54,10 +55,12 @@ namespace snmalloc * This does not require initialisation to be safely called. 
*/ template - SNMALLOC_FAST_PATH bool reserve_space(const Entry& entry) + SNMALLOC_FAST_PATH bool reserve_space(const Entry& entry, uint16_t n = 1) { + static_assert(sizeof(n) * 8 > MAX_CAPACITY_BITS); + auto size = - static_cast(sizeclass_full_to_size(entry.get_sizeclass())); + n * static_cast(sizeclass_full_to_size(entry.get_sizeclass())); bool result = capacity > size; if (result) @@ -70,13 +73,13 @@ namespace snmalloc dealloc(RemoteAllocator::alloc_id_t target_id, capptr::Alloc p) { SNMALLOC_ASSERT(initialised); - auto r = p.template as_reinterpret>(); + auto r = freelist::Object::make(p); list[get_slot(target_id, 0)].add( r, RemoteAllocator::key_global, NO_KEY_TWEAK); } - template + template bool post( typename Config::LocalState* local_state, RemoteAllocator::alloc_id_t id) { @@ -151,7 +154,7 @@ namespace snmalloc } } - // Reset capacity as we have empty everything + // Reset capacity as we have emptied everything capacity = REMOTE_CACHE; return sent_something; diff --git a/src/snmalloc/mem/sizeclasstable.h b/src/snmalloc/mem/sizeclasstable.h index 0a033319d..4dd2eec0e 100644 --- a/src/snmalloc/mem/sizeclasstable.h +++ b/src/snmalloc/mem/sizeclasstable.h @@ -165,6 +165,8 @@ namespace snmalloc uint16_t waking; }; + static_assert(sizeof(sizeclass_data_slow::capacity) * 8 > MAX_CAPACITY_BITS); + struct SizeClassTable { ModArray fast_{}; @@ -220,7 +222,7 @@ namespace snmalloc size_t slab_bits = bits::max( bits::next_pow2_bits_const(MIN_OBJECT_COUNT * rsize), MIN_CHUNK_BITS); - meta.slab_mask = bits::one_at_bit(slab_bits) - 1; + meta.slab_mask = bits::mask_bits(slab_bits); auto& meta_slow = slow(sizeclass_t::from_small_class(sizeclass)); meta_slow.capacity = @@ -245,8 +247,7 @@ namespace snmalloc { // Calculate reciprocal division constant. 
auto& meta = fast_small(sizeclass); - meta.div_mult = - ((bits::one_at_bit(DIV_MULT_SHIFT) - 1) / meta.size) + 1; + meta.div_mult = (bits::mask_bits(DIV_MULT_SHIFT) / meta.size) + 1; size_t zero = 0; meta.mod_zero_mult = (~zero / meta.size) + 1; @@ -270,6 +271,9 @@ namespace snmalloc constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); + static_assert( + bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); + constexpr size_t DIV_MULT_SHIFT = sizeclass_metadata.DIV_MULT_SHIFT; constexpr size_t sizeclass_to_size(smallsizeclass_t sizeclass) diff --git a/src/test/func/domestication/domestication.cc b/src/test/func/domestication/domestication.cc index aa84ecdf2..984d442f1 100644 --- a/src/test/func/domestication/domestication.cc +++ b/src/test/func/domestication/domestication.cc @@ -137,7 +137,8 @@ int main() LocalEntropy entropy; entropy.init(); - RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key()); + entropy.make_free_list_key(RemoteAllocator::key_global); + entropy.make_free_list_key(freelist::Object::key_root); auto alloc1 = new Alloc(); diff --git a/src/test/perf/contention/contention.cc b/src/test/perf/contention/contention.cc index 9e12b660a..c2cfd8f85 100644 --- a/src/test/perf/contention/contention.cc +++ b/src/test/perf/contention/contention.cc @@ -137,7 +137,7 @@ void test_tasks(size_t num_tasks, size_t count, size_t size) ParallelTest test(num_tasks); std::cout << "Task test, " << num_tasks << " threads, " << count - << " swaps per thread " << test.time() << "ticks" << std::endl; + << " swaps per thread " << test.time() << " ticks" << std::endl; for (size_t n = 0; n < swapsize; n++) {