diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 2577eab3b..ec254b6ea 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -242,6 +242,7 @@ namespace snmalloc freelist::Object::make( capptr_to_user_address_control(curr_ptr.as_void())), key, + NO_KEY_TWEAK, entropy); curr_ptr = curr_ptr->next; } while (curr_ptr != start_ptr); @@ -258,6 +259,7 @@ namespace snmalloc Aal::capptr_bound( p.as_void(), rsize))), key, + NO_KEY_TWEAK, entropy); p = pointer_offset(p, rsize); } while (p < slab_end); @@ -271,7 +273,7 @@ namespace snmalloc { auto& key = entropy.get_free_list_key(); freelist::Iter<> fl; - auto more = meta->free_queue.close(fl, key); + auto more = meta->free_queue.close(fl, key, NO_KEY_TWEAK); UNUSED(more); auto local_state = backend_state_ptr(); auto domesticate = [local_state](freelist::QueuePtr p) @@ -303,7 +305,7 @@ namespace snmalloc if (more > 0) { - auto no_more = meta->free_queue.close(fl, key); + auto no_more = meta->free_queue.close(fl, key, NO_KEY_TWEAK); SNMALLOC_ASSERT(no_more == 0); UNUSED(no_more); @@ -348,7 +350,8 @@ namespace snmalloc { if (check_slabs) { - meta->free_queue.validate(entropy.get_free_list_key(), domesticate); + meta->free_queue.validate( + entropy.get_free_list_key(), NO_KEY_TWEAK, domesticate); } return; } @@ -709,7 +712,7 @@ namespace snmalloc auto& key = entropy.get_free_list_key(); // Update the head and the next pointer in the free list. - meta->free_queue.add(cp, key, entropy); + meta->free_queue.add(cp, key, NO_KEY_TWEAK, entropy); return SNMALLOC_LIKELY(!meta->return_object()); } @@ -849,19 +852,14 @@ namespace snmalloc if (destroy_queue) { - auto p_wild = message_queue().destroy(); - auto p_tame = domesticate(p_wild); - - while (p_tame != nullptr) - { + auto cb = [this](capptr::Alloc p) { bool need_post = true; // Always going to post, so ignore. - auto n_tame = - p_tame->atomic_read_next(RemoteAllocator::key_global, domesticate); const PagemapEntry& entry = - Config::Backend::get_metaentry(snmalloc::address_cast(p_tame)); - handle_dealloc_remote(entry, p_tame.as_void(), need_post); - p_tame = n_tame; - } + Config::Backend::get_metaentry(snmalloc::address_cast(p)); + handle_dealloc_remote(entry, p.as_void(), need_post); + }; + + message_queue().destroy_and_iterate(domesticate, cb); } else { @@ -886,7 +884,8 @@ namespace snmalloc BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA { if (!meta->is_large()) { - meta->free_queue.validate(entropy.get_free_list_key(), domesticate); + meta->free_queue.validate( + entropy.get_free_list_key(), NO_KEY_TWEAK, domesticate); } }); diff --git a/src/snmalloc/mem/freelist.h b/src/snmalloc/mem/freelist.h index fb401d4b2..508a22839 100644 --- a/src/snmalloc/mem/freelist.h +++ b/src/snmalloc/mem/freelist.h @@ -40,15 +40,17 @@ namespace snmalloc { + static constexpr address_t NO_KEY_TWEAK = 0; + /** * This function is used to sign back pointers in the free list. 
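   * With a non-zero tweak, the second half of the key is perturbed
   * (key2 ^ tweak) before the multiplication, so signatures produced under
   * different tweaks are not interchangeable; passing NO_KEY_TWEAK (0)
   * yields exactly the value the untweaked scheme produced.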
*/ - inline static address_t - signed_prev(address_t curr, address_t next, const FreeListKey& key) + inline static address_t signed_prev( + address_t curr, address_t next, const FreeListKey& key, address_t tweak) { auto c = curr; auto n = next; - return (c + key.key1) * (n + key.key2); + return (c + key.key1) * (n + (key.key2 ^ tweak)); } namespace freelist @@ -171,22 +173,27 @@ namespace snmalloc SNMALLOC_CONCEPT(capptr::IsBound) BView = typename BQueue:: template with_wildness, typename Domesticator> - BHeadPtr - atomic_read_next(const FreeListKey& key, Domesticator domesticate) + BHeadPtr atomic_read_next( + const FreeListKey& key, address_t key_tweak, Domesticator domesticate) { auto n_wild = Object::decode_next( address_cast(&this->next_object), this->atomic_next_object.load(std::memory_order_acquire), - key); + key, + key_tweak); auto n_tame = domesticate(n_wild); if constexpr (mitigations(freelist_backward_edge)) { if (n_tame != nullptr) { - n_tame->prev.check_prev( - signed_prev(address_cast(this), address_cast(n_tame), key)); + n_tame->prev.check_prev(signed_prev( + address_cast(this), address_cast(n_tame), key, key_tweak)); } } + else + { + UNUSED(key_tweak); + } Aal::prefetch(n_tame.unsafe_ptr()); return n_tame; } @@ -198,11 +205,14 @@ namespace snmalloc SNMALLOC_CONCEPT(capptr::IsBound) BView = typename BQueue:: template with_wildness, typename Domesticator> - BHeadPtr - read_next(const FreeListKey& key, Domesticator domesticate) + BHeadPtr read_next( + const FreeListKey& key, address_t key_tweak, Domesticator domesticate) { return domesticate(Object::decode_next( - address_cast(&this->next_object), this->next_object, key)); + address_cast(&this->next_object), + this->next_object, + key, + key_tweak)); } /** @@ -253,8 +263,11 @@ namespace snmalloc * Involutive encryption with raw pointers */ template - inline static Object::T* - code_next(address_t curr, Object::T* next, const FreeListKey& key) + inline static Object::T* code_next( + address_t curr, + Object::T* next, + const FreeListKey& key, + address_t key_tweak) { // Note we can consider other encoding schemes here. // * XORing curr and next. 
This doesn't require any key material @@ -267,11 +280,13 @@ namespace snmalloc mitigations(freelist_forward_edge) && !aal_supports) { return unsafe_from_uintptr>( - unsafe_to_uintptr>(next) ^ key.key_next); + unsafe_to_uintptr>(next) ^ key.key_next ^ + key_tweak); } else { UNUSED(key); + UNUSED(key_tweak); return next; } } @@ -295,10 +310,13 @@ namespace snmalloc SNMALLOC_CONCEPT(capptr::IsBound) BView, SNMALLOC_CONCEPT(capptr::IsBound) BQueue> inline static BQueuePtr encode_next( - address_t curr, BHeadPtr next, const FreeListKey& key) + address_t curr, + BHeadPtr next, + const FreeListKey& key, + address_t key_tweak) { return BQueuePtr::unsafe_from( - code_next(curr, next.unsafe_ptr(), key)); + code_next(curr, next.unsafe_ptr(), key, key_tweak)); } /** @@ -320,10 +338,13 @@ namespace snmalloc SNMALLOC_CONCEPT(capptr::IsBound) BView, SNMALLOC_CONCEPT(capptr::IsBound) BQueue> inline static BHeadPtr decode_next( - address_t curr, BHeadPtr next, const FreeListKey& key) + address_t curr, + BHeadPtr next, + const FreeListKey& key, + address_t key_tweak) { return BHeadPtr::unsafe_from( - code_next(curr, next.unsafe_ptr(), key)); + code_next(curr, next.unsafe_ptr(), key, key_tweak)); } template< @@ -358,27 +379,32 @@ namespace snmalloc static BQueuePtr* store_next( BQueuePtr* curr, BHeadPtr next, - const FreeListKey& key) + const FreeListKey& key, + address_t key_tweak) { assert_view_queue_bounds(); if constexpr (mitigations(freelist_backward_edge)) { - next->prev.set_prev( - signed_prev(address_cast(curr), address_cast(next), key)); + next->prev.set_prev(signed_prev( + address_cast(curr), address_cast(next), key, key_tweak)); } else + { UNUSED(key); + UNUSED(key_tweak); + } - *curr = encode_next(address_cast(curr), next, key); + *curr = encode_next(address_cast(curr), next, key, key_tweak); return &(next->next_object); } template - static void store_null(BQueuePtr* curr, const FreeListKey& key) + static void store_null( + BQueuePtr* curr, const FreeListKey& key, address_t key_tweak) { - *curr = - encode_next(address_cast(curr), BQueuePtr(nullptr), key); + *curr = encode_next( + address_cast(curr), BQueuePtr(nullptr), key, key_tweak); } /** @@ -392,36 +418,45 @@ namespace snmalloc static void atomic_store_next( BHeadPtr curr, BHeadPtr next, - const FreeListKey& key) + const FreeListKey& key, + address_t key_tweak) { static_assert(BView::wildness == capptr::dimension::Wildness::Tame); if constexpr (mitigations(freelist_backward_edge)) { - next->prev.set_prev( - signed_prev(address_cast(curr), address_cast(next), key)); + next->prev.set_prev(signed_prev( + address_cast(curr), address_cast(next), key, key_tweak)); } else + { UNUSED(key); + UNUSED(key_tweak); + } // Signature needs to be visible before item is linked in // so requires release semantics. 
curr->atomic_next_object.store( - encode_next(address_cast(&curr->next_object), next, key), + encode_next(address_cast(&curr->next_object), next, key, key_tweak), std::memory_order_release); } template< SNMALLOC_CONCEPT(capptr::IsBound) BView, SNMALLOC_CONCEPT(capptr::IsBound) BQueue> - static void - atomic_store_null(BHeadPtr curr, const FreeListKey& key) + static void atomic_store_null( + BHeadPtr curr, + const FreeListKey& key, + address_t key_tweak) { static_assert(BView::wildness == capptr::dimension::Wildness::Tame); curr->atomic_next_object.store( encode_next( - address_cast(&curr->next_object), BQueuePtr(nullptr), key), + address_cast(&curr->next_object), + BQueuePtr(nullptr), + key, + key_tweak), std::memory_order_relaxed); } }; @@ -498,11 +533,47 @@ namespace snmalloc { Object::BHeadPtr curr{nullptr}; + struct KeyTweak + { + address_t key_tweak = 0; + SNMALLOC_FAST_PATH address_t get() + { + return key_tweak; + } + void set(address_t kt) + { + key_tweak = kt; + } + + constexpr KeyTweak() = default; + }; + + struct NoKeyTweak + { + SNMALLOC_FAST_PATH address_t get() + { + return 0; + } + void set(address_t) {} + }; + + SNMALLOC_NO_UNIQUE_ADDRESS + std::conditional_t< + mitigations(freelist_forward_edge) || + mitigations(freelist_backward_edge), + KeyTweak, + NoKeyTweak> + key_tweak; + public: - constexpr Iter(Object::BHeadPtr head, address_t prev_value) + constexpr Iter( + Object::BHeadPtr head, + address_t prev_value, + address_t kt) : IterBase(prev_value), curr(head) { UNUSED(prev_value); + key_tweak.set(kt); } constexpr Iter() = default; @@ -531,15 +602,15 @@ namespace snmalloc take(const FreeListKey& key, Domesticator domesticate) { auto c = curr; - auto next = curr->read_next(key, domesticate); + auto next = curr->read_next(key, key_tweak.get(), domesticate); Aal::prefetch(next.unsafe_ptr()); curr = next; if constexpr (mitigations(freelist_backward_edge)) { - auto p = - replace(signed_prev(address_cast(c), address_cast(next), key)); + auto p = replace(signed_prev( + address_cast(c), address_cast(next), key, key_tweak.get())); c->check_prev(p); } else @@ -636,6 +707,7 @@ namespace snmalloc void add( Object::BHeadPtr n, const FreeListKey& key, + address_t key_tweak, LocalEntropy& entropy) { uint32_t index; @@ -644,7 +716,7 @@ namespace snmalloc else index = 0; - set_end(index, Object::store_next(cast_end(index), n, key)); + set_end(index, Object::store_next(cast_end(index), n, key, key_tweak)); if constexpr (RANDOM) { length[index]++; @@ -660,20 +732,22 @@ namespace snmalloc * lists, which will be randomised at the other end. */ template - std::enable_if_t - add(Object::BHeadPtr n, const FreeListKey& key) + std::enable_if_t add( + Object::BHeadPtr n, + const FreeListKey& key, + address_t key_tweak) { static_assert(RANDOM_ == RANDOM, "Don't set template parameter"); - set_end(0, Object::store_next(cast_end(0), n, key)); + set_end(0, Object::store_next(cast_end(0), n, key, key_tweak)); } /** * Makes a terminator to a free list. */ - SNMALLOC_FAST_PATH void - terminate_list(uint32_t index, const FreeListKey& key) + SNMALLOC_FAST_PATH void terminate_list( + uint32_t index, const FreeListKey& key, address_t key_tweak) { - Object::store_null(cast_end(index), key); + Object::store_null(cast_end(index), key, key_tweak); } /** @@ -685,17 +759,21 @@ namespace snmalloc * and is thus subject to encoding if the next_object pointers * encoded. 
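   * If they are, the key_tweak passed here must match the one used when the
   * pointers were stored; NO_KEY_TWEAK (0) preserves the untweaked encoding.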
*/ - [[nodiscard]] Object::BHeadPtr - read_head(uint32_t index, const FreeListKey& key) const + [[nodiscard]] Object::BHeadPtr read_head( + uint32_t index, const FreeListKey& key, address_t key_tweak) const { return Object::decode_next( - address_cast(&head[index]), cast_head(index), key); + address_cast(&head[index]), cast_head(index), key, key_tweak); } - address_t get_fake_signed_prev(uint32_t index, const FreeListKey& key) + address_t get_fake_signed_prev( + uint32_t index, const FreeListKey& key, address_t key_tweak) { return signed_prev( - address_cast(&head[index]), address_cast(read_head(index, key)), key); + address_cast(&head[index]), + address_cast(read_head(index, key, key_tweak)), + key, + key_tweak); } /** @@ -707,8 +785,8 @@ namespace snmalloc * The return value is how many entries are still contained in the * builder. */ - SNMALLOC_FAST_PATH uint16_t - close(Iter& fl, const FreeListKey& key) + SNMALLOC_FAST_PATH uint16_t close( + Iter& fl, const FreeListKey& key, address_t key_tweak) { uint32_t i; if constexpr (RANDOM) @@ -724,9 +802,12 @@ namespace snmalloc i = 0; } - terminate_list(i, key); + terminate_list(i, key, key_tweak); - fl = {read_head(i, key), get_fake_signed_prev(i, key)}; + fl = { + read_head(i, key, key_tweak), + get_fake_signed_prev(i, key, key_tweak), + key_tweak}; end[i] = &head[i]; @@ -744,7 +825,8 @@ namespace snmalloc /** * Set the builder to a not building state. */ - constexpr void init(address_t slab, const FreeListKey& key) + constexpr void + init(address_t slab, const FreeListKey& key, address_t key_tweak) { for (size_t i = 0; i < LENGTH; i++) { @@ -762,7 +844,8 @@ namespace snmalloc head[i] = Object::code_next( address_cast(&head[i]), useless_ptr_from_addr>(slab), - key); + key, + key_tweak); } } @@ -772,25 +855,25 @@ namespace snmalloc std::pair< Object::BHeadPtr, Object::BHeadPtr>> - extract_segment(const FreeListKey& key) + extract_segment(const FreeListKey& key, address_t key_tweak) { static_assert(RANDOM_ == RANDOM, "Don't set SFINAE parameter!"); SNMALLOC_ASSERT(!empty()); - auto first = read_head(0, key); + auto first = read_head(0, key, key_tweak); // end[0] is pointing to the first field in the object, // this is doing a CONTAINING_RECORD like cast to get back // to the actual object. This isn't true if the builder is // empty, but you are not allowed to call this in the empty case. 
auto last = Object::BHeadPtr::unsafe_from( Object::from_next_ptr(cast_end(0))); - init(address_cast(head[0]), key); + init(address_cast(head[0]), key, key_tweak); return {first, last}; } template - SNMALLOC_FAST_PATH void - validate(const FreeListKey& key, Domesticator domesticate) + SNMALLOC_FAST_PATH void validate( + const FreeListKey& key, address_t key_tweak, Domesticator domesticate) { if constexpr (mitigations(freelist_teardown_validate)) { @@ -803,16 +886,17 @@ namespace snmalloc } size_t count = 1; - auto curr = read_head(i, key); - auto prev = get_fake_signed_prev(i, key); + auto curr = read_head(i, key, key_tweak); + auto prev = get_fake_signed_prev(i, key, key_tweak); while (true) { curr->check_prev(prev); if (address_cast(&(curr->next_object)) == address_cast(end[i])) break; count++; - auto next = curr->read_next(key, domesticate); - prev = signed_prev(address_cast(curr), address_cast(next), key); + auto next = curr->read_next(key, key_tweak, domesticate); + prev = signed_prev( + address_cast(curr), address_cast(next), key, key_tweak); curr = next; } SNMALLOC_CHECK(!RANDOM || (count == length[i])); @@ -821,6 +905,7 @@ namespace snmalloc else { UNUSED(key); + UNUSED(key_tweak); UNUSED(domesticate); } } diff --git a/src/snmalloc/mem/freelist_queue.h b/src/snmalloc/mem/freelist_queue.h new file mode 100644 index 000000000..e7dd813e1 --- /dev/null +++ b/src/snmalloc/mem/freelist_queue.h @@ -0,0 +1,194 @@ +#pragma once + +#include "../ds/ds.h" +#include "freelist.h" + +#include +#include + +namespace snmalloc +{ + /** + * A FreeListMPSCQ is a chain of freed objects exposed as a MPSC append-only + * atomic queue that uses one xchg per append. + * + * The internal pointers are considered QueuePtr-s to support deployment + * scenarios in which the MPSCQ itself is exposed to the client. This is + * excessively paranoid in the common case that these metadata are as "hard" + * for the client to reach as the Pagemap, which we trust to store not just + * Tame CapPtr<>s but raw C++ pointers. + * + * Where necessary, methods expose two domesticator callbacks at the + * interface and are careful to use one for the front and back values and the + * other for pointers read from the queue itself. That's not ideal, but it + * lets the client condition its behavior appropriately and prevents us from + * accidentally following either of these pointers in generic code. + * Specifically, + * + * * `domesticate_head` is used for the MPSCQ pointers used to reach into + * the chain of objects + * + * * `domesticate_queue` is used to traverse links in that chain (and in + * fact, we traverse only the first). + * + * In the case that the MPSCQ is not easily accessible to the client, + * `domesticate_head` can just be a type coersion, and `domesticate_queue` + * should perform actual validation. If the MPSCQ is exposed to the + * allocator client, both Domesticators should perform validation. + */ + template + struct alignas(REMOTE_MIN_ALIGN) FreeListMPSCQ + { + // Store the message queue on a separate cacheline. It is mutable data that + // is read by other threads. + alignas(CACHELINE_SIZE) freelist::AtomicQueuePtr back{nullptr}; + // Store the two ends on different cache lines as access by different + // threads. 
+ alignas(CACHELINE_SIZE) freelist::AtomicQueuePtr front{nullptr}; + // Fake first entry + freelist::Object::T stub{}; + + constexpr FreeListMPSCQ() = default; + + void invariant() + { + SNMALLOC_ASSERT( + (address_cast(front.load()) == address_cast(&stub)) || + (back != nullptr)); + } + + void init() + { + freelist::HeadPtr stub_ptr = freelist::HeadPtr::unsafe_from(&stub); + freelist::Object::atomic_store_null(stub_ptr, Key, Key_tweak); + front.store(freelist::QueuePtr::unsafe_from(&stub)); + back.store(nullptr, std::memory_order_relaxed); + invariant(); + } + + freelist::QueuePtr destroy() + { + freelist::QueuePtr fnt = front.load(); + back.store(nullptr, std::memory_order_relaxed); + if (address_cast(front.load()) == address_cast(&stub)) + return nullptr; + return fnt; + } + + template + void destroy_and_iterate(Domesticator_queue domesticate, Cb cb) + { + auto p = domesticate(destroy()); + + while (p != nullptr) + { + auto n = p->atomic_read_next(Key, Key_tweak, domesticate); + cb(p); + p = n; + } + } + + template + inline bool can_dequeue( + Domesticator_head domesticate_head, Domesticator_queue domesticate_queue) + { + return domesticate_head(front.load()) + ->atomic_read_next(Key, Key_tweak, domesticate_queue) != nullptr; + } + + /** + * Pushes a list of messages to the queue. Each message from first to + * last should be linked together through their next pointers. + * + * The Domesticator here is used only on pointers read from the head. See + * the commentary on the class. + */ + template + void enqueue( + freelist::HeadPtr first, + freelist::HeadPtr last, + Domesticator_head domesticate_head) + { + invariant(); + freelist::Object::atomic_store_null(last, Key, Key_tweak); + + // Exchange needs to be acq_rel. + // * It needs to be a release, so nullptr in next is visible. + // * Needs to be acquire, so linking into the list does not race with + // the other threads nullptr init of the next field. + freelist::QueuePtr prev = + back.exchange(capptr_rewild(last), std::memory_order_acq_rel); + + if (SNMALLOC_LIKELY(prev != nullptr)) + { + freelist::Object::atomic_store_next( + domesticate_head(prev), first, Key, Key_tweak); + return; + } + + front.store(capptr_rewild(first)); + } + + /** + * Destructively iterate the queue. Each queue element is removed and fed + * to the callback in turn. The callback may return false to stop iteration + * early (but must have processed the element it was given!). + * + * Takes a domestication callback for each of "pointers read from head" and + * "pointers read from queue". See the commentary on the class. + */ + template< + typename Domesticator_head, + typename Domesticator_queue, + typename Cb> + void dequeue( + Domesticator_head domesticate_head, + Domesticator_queue domesticate_queue, + Cb cb) + { + invariant(); + SNMALLOC_ASSERT(front.load() != nullptr); + + // Use back to bound, so we don't handle new entries. + auto b = back.load(std::memory_order_relaxed); + freelist::HeadPtr curr = domesticate_head(front.load()); + + while (address_cast(curr) != address_cast(b)) + { + freelist::HeadPtr next = + curr->atomic_read_next(Key, Key_tweak, domesticate_queue); + // We have observed a non-linearisable effect of the queue. + // Just go back to allocating normally. + if (SNMALLOC_UNLIKELY(next == nullptr)) + break; + // We want this element next, so start it loading. 
+ Aal::prefetch(next.unsafe_ptr()); + if (SNMALLOC_UNLIKELY(!cb(curr))) + { + /* + * We've domesticate_queue-d next so that we can read through it, but + * we're storing it back into client-accessible memory in + * !QueueHeadsAreTame builds, so go ahead and consider it Wild again. + * On QueueHeadsAreTame builds, the subsequent domesticate_head call + * above will also be a type-level sleight of hand, but we can still + * justify it by the domesticate_queue that happened in this + * dequeue(). + */ + front = capptr_rewild(next); + invariant(); + return; + } + + curr = next; + } + + /* + * Here, we've hit the end of the queue: next is nullptr and curr has not + * been handed to the callback. The same considerations about Wildness + * above hold here. + */ + front = capptr_rewild(curr); + invariant(); + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 10fa93e37..d696a4de6 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -455,7 +455,7 @@ namespace snmalloc static_assert( std::is_base_of::value, "Template should be a subclass of FrontendSlabMetadata"); - free_queue.init(slab, key); + free_queue.init(slab, key, NO_KEY_TWEAK); // Set up meta data as if the entire slab has been turned into a free // list. This means we don't have to check for special cases where we have // returned all the elements, but this is a slab that is still being bump @@ -477,7 +477,7 @@ namespace snmalloc void initialise_large(address_t slab, const FreeListKey& key) { // We will push to this just to make the fast path clean. - free_queue.init(slab, key); + free_queue.init(slab, key, NO_KEY_TWEAK); // Flag to detect that it is a large alloc on the slow path large_ = true; @@ -576,7 +576,8 @@ namespace snmalloc auto& key = entropy.get_free_list_key(); std::remove_reference_t tmp_fl; - auto remaining = meta->free_queue.close(tmp_fl, key); + + auto remaining = meta->free_queue.close(tmp_fl, key, NO_KEY_TWEAK); auto p = tmp_fl.take(key, domesticate); fast_free_list = tmp_fl; @@ -598,7 +599,7 @@ namespace snmalloc // start of the slab. [[nodiscard]] address_t get_slab_interior(const FreeListKey& key) const { - return address_cast(free_queue.read_head(0, key)); + return address_cast(free_queue.read_head(0, key, NO_KEY_TWEAK)); } typename ClientMeta::DataRef get_meta_for_object(size_t index) diff --git a/src/snmalloc/mem/remoteallocator.h b/src/snmalloc/mem/remoteallocator.h index f441d1aeb..c0fb7240c 100644 --- a/src/snmalloc/mem/remoteallocator.h +++ b/src/snmalloc/mem/remoteallocator.h @@ -1,43 +1,20 @@ #pragma once -#include "../ds/ds.h" -#include "freelist.h" -#include "metadata.h" -#include "sizeclasstable.h" - -#include -#include +#include "freelist_queue.h" +#include "remotecache.h" namespace snmalloc { /** + * A RemoteAllocator is the message queue of freed objects. It builds on the + * FreeListMPSCQ but encapsulates knowledge that the objects are actually + * RemoteMessage-s and not just any freelist::object::T<>s. * - * A RemoteAllocator is the message queue of freed objects. It exposes a MPSC - * append-only atomic queue that uses one xchg per append. - * - * The internal pointers are considered QueuePtr-s to support deployment - * scenarios in which the RemoteAllocator itself is exposed to the client. - * This is excessively paranoid in the common case that the RemoteAllocator-s - * are as "hard" for the client to reach as the Pagemap, which we trust to - * store not just Tame CapPtr<>s but raw C++ pointers. 
- * - * While we could try to condition the types used here on a flag in the - * backend's `struct Flags Options` value, we instead expose two domesticator - * callbacks at the interface and are careful to use one for the front and - * back values and the other for pointers read from the queue itself. That's - * not ideal, but it lets the client condition its behavior appropriately and - * prevents us from accidentally following either of these pointers in generic - * code. - * - * `domesticate_head` is used for the pointer used to reach the of the queue, - * while `domesticate_queue` is used to traverse the first link in the queue - * itself. In the case that the RemoteAllocator is not easily accessible to - * the client, `domesticate_head` can just be a type coersion, and - * `domesticate_queue` should perform actual validation. If the - * RemoteAllocator is exposed to the client, both Domesticators should perform - * validation. + * RemoteAllocator-s may be exposed to client tampering. As a result, + * pointer domestication may be necessary. See the documentation for + * FreeListMPSCQ for details. */ - struct alignas(REMOTE_MIN_ALIGN) RemoteAllocator + struct RemoteAllocator { /** * Global key for all remote lists. @@ -49,50 +26,36 @@ namespace snmalloc */ inline static FreeListKey key_global{0xdeadbeef, 0xbeefdead, 0xdeadbeef}; - using alloc_id_t = address_t; + FreeListMPSCQ list; - // Store the message queue on a separate cacheline. It is mutable data that - // is read by other threads. - alignas(CACHELINE_SIZE) freelist::AtomicQueuePtr back{nullptr}; - // Store the two ends on different cache lines as access by different - // threads. - alignas(CACHELINE_SIZE) freelist::AtomicQueuePtr front{nullptr}; - // Fake first entry - freelist::Object::T stub{}; + using alloc_id_t = address_t; constexpr RemoteAllocator() = default; void invariant() { - SNMALLOC_ASSERT( - (address_cast(front.load()) == address_cast(&stub)) || - (back != nullptr)); + list.invariant(); } void init() { - freelist::HeadPtr stub_ptr = freelist::HeadPtr::unsafe_from(&stub); - freelist::Object::atomic_store_null(stub_ptr, key_global); - front.store(freelist::QueuePtr::unsafe_from(&stub)); - back.store(nullptr, std::memory_order_relaxed); - invariant(); + list.init(); } - freelist::QueuePtr destroy() + template + void destroy_and_iterate(Domesticator_queue domesticate, Cb cb) { - freelist::QueuePtr fnt = front.load(); - back.store(nullptr, std::memory_order_relaxed); - if (address_cast(front.load()) == address_cast(&stub)) - return nullptr; - return fnt; + auto cbwrap = [cb](freelist::HeadPtr p) + SNMALLOC_FAST_PATH_LAMBDA { cb(p.as_void()); }; + + return list.destroy_and_iterate(domesticate, cbwrap); } template inline bool can_dequeue( Domesticator_head domesticate_head, Domesticator_queue domesticate_queue) { - return domesticate_head(front.load()) - ->atomic_read_next(key_global, domesticate_queue) != nullptr; + return list.can_dequeue(domesticate_head, domesticate_queue); } /** @@ -108,24 +71,7 @@ namespace snmalloc freelist::HeadPtr last, Domesticator_head domesticate_head) { - invariant(); - freelist::Object::atomic_store_null(last, key_global); - - // Exchange needs to be acq_rel. - // * It needs to be a release, so nullptr in next is visible. - // * Needs to be acquire, so linking into the list does not race with - // the other threads nullptr init of the next field. 
- freelist::QueuePtr prev = - back.exchange(capptr_rewild(last), std::memory_order_acq_rel); - - if (SNMALLOC_LIKELY(prev != nullptr)) - { - freelist::Object::atomic_store_next( - domesticate_head(prev), first, key_global); - return; - } - - front.store(capptr_rewild(first)); + list.enqueue(first, last, domesticate_head); } /** @@ -145,49 +91,7 @@ namespace snmalloc Domesticator_queue domesticate_queue, Cb cb) { - invariant(); - SNMALLOC_ASSERT(front.load() != nullptr); - - // Use back to bound, so we don't handle new entries. - auto b = back.load(std::memory_order_relaxed); - freelist::HeadPtr curr = domesticate_head(front.load()); - - while (address_cast(curr) != address_cast(b)) - { - freelist::HeadPtr next = - curr->atomic_read_next(key_global, domesticate_queue); - // We have observed a non-linearisable effect of the queue. - // Just go back to allocating normally. - if (SNMALLOC_UNLIKELY(next == nullptr)) - break; - // We want this element next, so start it loading. - Aal::prefetch(next.unsafe_ptr()); - if (SNMALLOC_UNLIKELY(!cb(curr))) - { - /* - * We've domesticate_queue-d next so that we can read through it, but - * we're storing it back into client-accessible memory in - * !QueueHeadsAreTame builds, so go ahead and consider it Wild again. - * On QueueHeadsAreTame builds, the subsequent domesticate_head call - * above will also be a type-level sleight of hand, but we can still - * justify it by the domesticate_queue that happened in this - * dequeue(). - */ - front = capptr_rewild(next); - invariant(); - return; - } - - curr = next; - } - - /* - * Here, we've hit the end of the queue: next is nullptr and curr has not - * been handed to the callback. The same considerations about Wildness - * above hold here. - */ - front = capptr_rewild(curr); - invariant(); + list.dequeue(domesticate_head, domesticate_queue, cb); } alloc_id_t trunc_id() diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h index 96f5e0973..faea14103 100644 --- a/src/snmalloc/mem/remotecache.h +++ b/src/snmalloc/mem/remotecache.h @@ -73,7 +73,7 @@ namespace snmalloc auto r = p.template as_reinterpret>(); list[get_slot(target_id, 0)].add( - r, RemoteAllocator::key_global); + r, RemoteAllocator::key_global, NO_KEY_TWEAK); } template @@ -102,7 +102,7 @@ namespace snmalloc if (!list[i].empty()) { - auto [first, last] = list[i].extract_segment(key); + auto [first, last] = list[i].extract_segment(key, NO_KEY_TWEAK); const auto& entry = Config::Backend::get_metaentry(address_cast(first)); auto remote = entry.get_remote(); @@ -135,7 +135,7 @@ namespace snmalloc // so take copy of the head, mark the last element, // and clear the original list. freelist::Iter<> resend; - list[my_slot].close(resend, key); + list[my_slot].close(resend, key, NO_KEY_TWEAK); post_round++; @@ -147,7 +147,7 @@ namespace snmalloc const auto& entry = Config::Backend::get_metaentry(address_cast(r)); auto i = entry.get_remote()->trunc_id(); size_t slot = get_slot(i, post_round); - list[slot].add(r, key); + list[slot].add(r, key, NO_KEY_TWEAK); } } @@ -175,7 +175,7 @@ namespace snmalloc { // We do not need to initialise with a particular slab, so pass // a null address. 
- l.init(0, RemoteAllocator::key_global); + l.init(0, RemoteAllocator::key_global, NO_KEY_TWEAK); } capacity = REMOTE_CACHE; } diff --git a/src/test/perf/msgpass/msgpass.cc b/src/test/perf/msgpass/msgpass.cc new file mode 100644 index 000000000..f3918595a --- /dev/null +++ b/src/test/perf/msgpass/msgpass.cc @@ -0,0 +1,302 @@ +/** + * A simulation of a message-passing application workload for snmalloc. + * + * - N_PRODUCER producer threads allocate and queue spans of messages randomly, + * - to N_CONSUMER consumer threads, which dequeue messages and free() them. + * + * Optionally, N_PROXY threads act as both producers and consumers, forwarding + * received messages back to another queue rather than freeing them. + */ + +#include "test/opt.h" +#include "test/setup.h" +#include "test/usage.h" +#include "test/xoroshiro.h" + +constexpr static bool be_chatty = false; + +#include +#include +#include +#include +#include +#include +#include + +using namespace snmalloc; + +void chatty(const char* p, ...) +{ + if constexpr (be_chatty) + { + va_list va; + va_start(va, p); + vfprintf(stderr, p, va); + va_end(va); + } +} + +/* + * Interpret SNMALLOC_PASS_THROUGH ourselves to make this a bit more fair of a + * comparison, since relying of snmalloc itself to do the passing through + * results in it imposing its own idea of alignment onto the underlying + * allocator, which might result in it taking less optimized paths. + */ +#ifdef SNMALLOC_PASS_THROUGH +struct MyAlloc +{ + MyAlloc() {} + void* alloc(size_t sz) + { + return malloc(sz); + } + void dealloc(void* p) + { + free(p); + } +}; +#else +struct MyAlloc +{ + snmalloc::Alloc& a; + MyAlloc() : a(ThreadAlloc::get()) {} + void* alloc(size_t sz) + { + return a.alloc(sz); + } + void dealloc(void* p) + { + a.dealloc(p); + } +}; +#endif + +/* + * FreeListMPSCQ make for convenient MPSC queues, so we use those for sending + * "messages". Each consumer or proxy has its own (source) queue. 
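 * Each queue is keyed with its own msgqueue_key and a non-zero
 * msgqueue_key_tweak, distinct from the allocator's global free-list key.
 * Producers batch allocations with a freelist::Builder, extract the batch as
 * a (first, last) segment and enqueue it in one call; consumers drain with
 * dequeue() and free each message, while proxies re-enqueue what they
 * receive onto another queue.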
+ */ +static FreeListKey msgqueue_key{0xab2acada, 0xb2a01234, 0x56789abc}; +static constexpr address_t msgqueue_key_tweak = 0xfedc'ba98; + +struct params +{ + size_t N_PRODUCER; + size_t N_CONSUMER; + size_t N_PROXY; + size_t N_QUEUE; + size_t N_PRODUCER_BATCH; + size_t N_MAX_OUTSTANDING; + size_t N_MAX_BATCH_SIZE; + FreeListMPSCQ* msgqueue; // [N_QUEUE] +}; + +std::atomic producers_live; +std::atomic queue_gate; +std::atomic messages_outstanding; + +freelist::HeadPtr domesticate_nop(freelist::QueuePtr p) +{ + return freelist::HeadPtr::unsafe_from(p.unsafe_ptr()); +}; + +void consumer(const struct params* param, size_t qix) +{ + MyAlloc a{}; + auto& myq = param->msgqueue[qix]; + + chatty("Cl %zu q is %p\n", qix, &myq); + + do + { + size_t reap = 0; + + if (myq.can_dequeue(domesticate_nop, domesticate_nop)) + { + myq.dequeue( + domesticate_nop, + domesticate_nop, + [qix, &a, &reap](freelist::HeadPtr o) { + UNUSED(qix); + auto p = o.as_void().unsafe_ptr(); + chatty("Cl %zu free %p\n", qix, p); + a.dealloc(p); + reap++; + return true; + }); + } + + messages_outstanding -= reap; + + if (reap == 0) + { + std::this_thread::yield(); + } + else + { + chatty("Cl %zu reap %zu\n", qix, reap); + } + + } while (myq.can_dequeue(domesticate_nop, domesticate_nop) || + producers_live || (queue_gate > param->N_CONSUMER)); + + chatty("Cl %zu fini\n", qix); + a.dealloc(myq.destroy().unsafe_ptr()); +} + +void proxy(const struct params* param, size_t qix) +{ + auto& myq = param->msgqueue[qix]; + auto& qs = param->msgqueue; + + chatty("Px %zu q is %p\n", qix, &myq); + + xoroshiro::p128r32 r(1234 + qix, qix); + do + { + if (myq.can_dequeue(domesticate_nop, domesticate_nop)) + { + myq.dequeue( + domesticate_nop, domesticate_nop, [qs, qix, &r](freelist::HeadPtr o) { + auto rcptqix = r.next() % qix; + + chatty( + "Px %zu send %p to %zu\n", qix, o.as_void().unsafe_ptr(), rcptqix); + + qs[rcptqix].enqueue(o, o, domesticate_nop); + return true; + }); + } + + std::this_thread::yield(); + } while (myq.can_dequeue(domesticate_nop, domesticate_nop) || + producers_live || (queue_gate > qix + 1)); + + chatty("Px %zu fini\n", qix); + + MyAlloc().dealloc(myq.destroy().unsafe_ptr()); + queue_gate--; +} + +void producer(const struct params* param, size_t pix) +{ + MyAlloc a{}; + static constexpr size_t msgsizes[] = {48, 64, 96, 128}; + static constexpr size_t nmsgsizes = sizeof(msgsizes) / sizeof(msgsizes[0]); + + xoroshiro::p128r32 r(5489 + pix, pix); + + freelist::Builder batch; + batch.init(0, msgqueue_key, msgqueue_key_tweak); + + for (size_t batchix = param->N_PRODUCER_BATCH; batchix > 0; batchix--) + { + while (messages_outstanding >= param->N_MAX_OUTSTANDING) + { + std::this_thread::yield(); + } + + size_t nmsg = (r.next() & 15) + 1; + size_t msgsize = msgsizes[r.next() % nmsgsizes]; + + /* Allocate batch and form list */ + for (size_t msgix = 0; msgix < nmsg; msgix++) + { + auto msg = a.alloc(msgsize); + chatty("Pd %zu make %p\n", pix, msg); + + auto msgc = capptr::Alloc::unsafe_from(msg) + .template as_reinterpret>(); + batch.add(msgc, msgqueue_key, msgqueue_key_tweak); + } + + /* Post to random queue */ + auto [bfirst, blast] = + batch.extract_segment(msgqueue_key, msgqueue_key_tweak); + auto rcptqix = r.next() % param->N_QUEUE; + param->msgqueue[rcptqix].enqueue(bfirst, blast, domesticate_nop); + messages_outstanding += nmsg; + + chatty("Pd %zu send %zu to %zu\n", pix, nmsg, rcptqix); + + /* Occasionally yield the CPU */ + if ((batchix & 0xF) == 1) + std::this_thread::yield(); + } + + chatty("Pd %zu fini\n", pix); +} 
+ +int main(int argc, char** argv) +{ + struct params param; + + opt::Opt opt(argc, argv); + param.N_PRODUCER = opt.is("--producers", 3); + param.N_CONSUMER = opt.is("--consumers", 3); + param.N_PROXY = opt.is("--proxies", 2); + param.N_PRODUCER_BATCH = opt.is("--batches", 1024 * 1024); + param.N_MAX_OUTSTANDING = opt.is("--max-out", 4 * 1024); + param.N_MAX_BATCH_SIZE = opt.is("--max-batch", 16); + + std::cout << "msgpass --producers=" << param.N_PRODUCER + << " --consumers=" << param.N_CONSUMER + << " --proxies=" << param.N_PROXY + << " --batches=" << param.N_PRODUCER_BATCH + << " --max-out=" << param.N_MAX_OUTSTANDING + << " --max-batch=" << param.N_MAX_BATCH_SIZE << std::endl; + + param.N_QUEUE = param.N_CONSUMER + param.N_PROXY; + param.msgqueue = + new FreeListMPSCQ[param.N_QUEUE]; + + auto* producer_threads = new std::thread[param.N_PRODUCER]; + auto* queue_threads = new std::thread[param.N_QUEUE]; + + for (size_t i = 0; i < param.N_QUEUE; i++) + { + param.msgqueue[i].init(); + } + + producers_live = true; + queue_gate = param.N_QUEUE; + messages_outstanding = 0; + + /* Spawn consumers */ + for (size_t i = 0; i < param.N_CONSUMER; i++) + { + queue_threads[i] = std::thread(consumer, ¶m, i); + } + + /* Spawn proxies */ + for (size_t i = param.N_CONSUMER; i < param.N_QUEUE; i++) + { + queue_threads[i] = std::thread(proxy, ¶m, i); + } + + /* Spawn producers */ + for (size_t i = 0; i < param.N_PRODUCER; i++) + { + producer_threads[i] = std::thread(producer, ¶m, i); + } + + /* Wait for producers to finish */ + for (size_t i = 0; i < param.N_PRODUCER; i++) + { + producer_threads[i].join(); + } + producers_live = false; + + /* Wait for proxies and consumers to finish */ + for (size_t i = 0; i < param.N_QUEUE; i++) + { + queue_threads[param.N_QUEUE - 1 - i].join(); + } + + delete[] producer_threads; + delete[] queue_threads; + + /* Ensure that we have not lost any allocations */ + debug_check_empty(); + + return 0; +}
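/*
 * An illustrative, self-contained sketch of the involutive XOR encoding that
 * code_next() applies when freelist_forward_edge hardening is enabled (with
 * the mitigation off it returns next unchanged): encoding twice with the
 * same key and tweak round-trips the pointer, while decoding under a
 * different tweak does not.  Plain uintptr_t stands in for the CapPtr types.
 */
#include <cassert>
#include <cstdint>

static std::uintptr_t code_next_sketch(
  std::uintptr_t next, std::uintptr_t key_next, std::uintptr_t key_tweak)
{
  return next ^ key_next ^ key_tweak;
}

static void involution_demo()
{
  const std::uintptr_t p = 0x12345678;
  const std::uintptr_t key_next = 0xdeadbeef;
  const std::uintptr_t tweak = 0xfedcba98;

  // Same key and tweak on both sides: the pointer round-trips.
  assert(
    code_next_sketch(code_next_sketch(p, key_next, tweak), key_next, tweak) ==
    p);

  // Decoding with NO_KEY_TWEAK (0) after encoding under a non-zero tweak
  // does not recover the pointer; a list built under one tweak cannot be
  // followed using another.
  assert(
    code_next_sketch(code_next_sketch(p, key_next, tweak), key_next, 0) != p);
}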