diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp index d75485fec3a..7694fc641c9 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp @@ -87,6 +87,29 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register start, Register count, Register scratch, RegSet saved_regs) { + if (G1BarrierSimple) { + + Label L_loop, L_done; + const Register end = count; + + __ cbz(count, L_done); // zero count - nothing to do + + __ lea(end, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop + __ sub(end, end, BytesPerHeapOop); // last element address to make inclusive + __ lsr(start, start, CardTable::card_shift); + __ lsr(end, end, CardTable::card_shift); + __ sub(count, end, start); // number of bytes to copy + + __ load_byte_map_base(scratch); + __ add(start, start, scratch); + __ bind(L_loop); + __ strb(zr, Address(start, count)); + __ subs(count, count, 1); + __ br(Assembler::GE, L_loop); + __ bind(L_done); + return; + } + __ push(saved_regs, sp); assert_different_registers(start, count, scratch); assert_different_registers(c_rarg0, count); @@ -209,6 +232,15 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Label done; Label runtime; + if (G1BarrierSimple) { + const Register card_addr = tmp; + __ lsr(card_addr, store_addr, CardTable::card_shift); + __ load_byte_map_base(tmp2); + __ add(card_addr, card_addr, tmp2); + __ strb(zr, Address(card_addr)); + __ bind(done); + return; + } // Does store cross heap regions? __ eor(tmp, store_addr, new_val); @@ -449,6 +481,13 @@ void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* __ load_parameter(0, card_offset); __ lsr(card_offset, card_offset, CardTable::card_shift); __ load_byte_map_base(byte_map_base); + + if (G1BarrierSimple) { + __ strb(zr, Address(byte_map_base, card_offset)); + __ bind(done); + __ epilogue(); + return; + } __ ldrb(rscratch1, Address(byte_map_base, card_offset)); __ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val()); __ br(Assembler::EQ, done); diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp index c8271ead5de..ac485125c0a 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp @@ -41,6 +41,8 @@ #define __ masm-> +#define TIMES_OOP (UseCompressedOops ? 
Address::times_4 : Address::times_8) + void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count) { bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0; @@ -97,6 +99,35 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count, Register tmp) { + if (G1BarrierSimple) { + CardTableBarrierSet* ct = + barrier_set_cast(BarrierSet::barrier_set()); + intptr_t disp = (intptr_t) ct->card_table()->byte_map_base(); + + Label L_loop, L_done; + const Register end = count; + assert_different_registers(addr, end); + + __ testl(count, count); + __ jcc(Assembler::zero, L_done); // zero count - nothing to do + + __ leaq(end, Address(addr, count, TIMES_OOP, 0)); // end == addr+count*oop_size + __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive + __ shrptr(addr, CardTable::card_shift); + __ shrptr(end, CardTable::card_shift); + __ subptr(end, addr); // end --> cards count + + __ mov64(tmp, disp); + __ addptr(addr, tmp); + __ bind(L_loop); + __ movb(Address(addr, count, Address::times_1), 0); + __ decrement(count); + __ jcc(Assembler::greaterEqual, L_loop); + __ bind(L_done); + + return; + } + __ pusha(); // push registers (overkill) #ifdef _LP64 if (c_rarg0 == count) { // On win64 c_rarg0 == rcx @@ -302,6 +333,12 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, __ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base()); __ addptr(card_addr, cardtable); + if (G1BarrierSimple) { + __ movb(Address(card_addr, 0), (int)G1CardTable::dirty_card_val()); + __ bind(done); + return; + } + __ cmpb(Address(card_addr, 0), (int)G1CardTable::g1_young_card_val()); __ jcc(Assembler::equal, done); @@ -551,6 +588,17 @@ void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* NOT_LP64(__ get_thread(thread);) + if (G1BarrierSimple) { + __ movb(Address(card_addr, 0), (int)G1CardTable::dirty_card_val()); + __ bind(done); + __ pop(rcx); + __ pop(rax); + + __ epilogue(); + + return; + } + __ cmpb(Address(card_addr, 0), (int)G1CardTable::g1_young_card_val()); __ jcc(Assembler::equal, done); diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index c54f41efd6e..8d97939a459 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -447,6 +447,13 @@ void G1BarrierSetC2::post_barrier(GraphKit* kit, // Combine card table base and card offset Node* card_adr = __ AddP(no_base, byte_map_base_node(kit), card_offset ); + if (G1BarrierSimple) { + __ store(__ ctrl(), card_adr, dirty_card, T_BYTE, Compile::AliasIdxRaw, MemNode::unordered); + // Final sync IdealKit and GraphKit. + kit->final_sync(ideal); + return; + } + // If we know the value being stored does it cross regions? if (val != NULL) { @@ -658,6 +665,39 @@ bool G1BarrierSetC2::is_gc_barrier_node(Node* node) const { void G1BarrierSetC2::eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const { assert(node->Opcode() == Op_CastP2X, "ConvP2XNode required"); + + if (G1BarrierSimple) { + Node* this_region = node->in(0); + // Search "if (marking != 0)" check and set it to "false". + // There is no G1 pre barrier if previous stored value is NULL + // (for example, after initialization). 
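For readers who do not want to decode the assembler above, the G1BarrierSimple fast paths boil down to unconditional card dirtying. The following standalone C++ sketch models the arithmetic the stubs perform; the card size, dirty value and byte map are illustrative stand-ins, not the HotSpot objects:

    #include <cstddef>
    #include <cstdint>

    // Illustrative stand-ins for CardTable::card_shift and dirty_card_val();
    // byte_map_base is assumed to be biased so (addr >> kCardShift) indexes it.
    static const int     kCardShift = 9;   // 512-byte cards, as in HotSpot
    static const uint8_t kDirtyCard = 0;   // dirty_card_val()

    // Per-store post barrier: unconditionally dirty the card covering the field.
    inline void simple_post_barrier(uint8_t* byte_map_base, const void* field) {
      byte_map_base[reinterpret_cast<uintptr_t>(field) >> kCardShift] = kDirtyCard;
    }

    // Array-range post barrier: dirty every card covered by 'count' oops starting
    // at 'start', using the same inclusive last-element computation as the stubs.
    inline void simple_array_post_barrier(uint8_t* byte_map_base, const void* start,
                                          size_t count, size_t oop_size) {
      if (count == 0) return;
      uintptr_t first = reinterpret_cast<uintptr_t>(start) >> kCardShift;
      uintptr_t last  = (reinterpret_cast<uintptr_t>(start) + count * oop_size - 1) >> kCardShift;
      for (uintptr_t c = last + 1; c > first; ) {   // backward, like the strb/movb loops
        --c;
        byte_map_base[c] = kDirtyCard;
      }
    }

The point of the change is exactly that this path has no young-card check and no dirty card queue, which is also why g1Arguments.cpp below forces G1ConcRefinementThreads to 0.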
+ if (this_region->is_Region() && this_region->req() == 3) { + int ind = 1; + if (!this_region->in(ind)->is_IfFalse()) { + ind = 2; + } + if (this_region->in(ind)->is_IfFalse() && + this_region->in(ind)->in(0)->Opcode() == Op_If) { + Node* bol = this_region->in(ind)->in(0)->in(1); + assert(bol->is_Bool(), ""); + Node* cmpx = bol->in(1); + if (bol->as_Bool()->_test._test == BoolTest::ne && + cmpx->is_Cmp() && cmpx->in(2) == macro->intcon(0) && + cmpx->in(1)->is_Load()) { + Node* adr = cmpx->in(1)->as_Load()->in(MemNode::Address); + const int marking_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); + if (adr->is_AddP() && adr->in(AddPNode::Base) == macro->top() && + adr->in(AddPNode::Address)->Opcode() == Op_ThreadLocal && + adr->in(AddPNode::Offset) == macro->MakeConX(marking_offset)) { + macro->replace_node(cmpx, macro->makecon(TypeInt::CC_EQ)); + } + } + } + } + CardTableBarrierSetC2::eliminate_gc_barrier(macro, node); + return; + } + assert(node->outcnt() <= 2, "expects 1 or 2 users: Xor and URShift nodes"); // It could be only one user, URShift node, in Object.clone() intrinsic // but the new allocation is passed to arraycopy stub and it could not diff --git a/src/hotspot/share/gc/g1/g1Arguments.cpp b/src/hotspot/share/gc/g1/g1Arguments.cpp index c397910ef3b..67413b90182 100644 --- a/src/hotspot/share/gc/g1/g1Arguments.cpp +++ b/src/hotspot/share/gc/g1/g1Arguments.cpp @@ -83,7 +83,10 @@ void G1Arguments::initialize() { vm_exit_during_initialization("The flag -XX:+UseG1GC can not be combined with -XX:ParallelGCThreads=0", NULL); } - if (FLAG_IS_DEFAULT(G1ConcRefinementThreads)) { + if (G1BarrierSimple) { + FLAG_SET_DEFAULT(G1ConcRefinementThreads, 0); + FLAG_SET_DEFAULT(G1ConcRSLogCacheSize, 0); + } else if (FLAG_IS_DEFAULT(G1ConcRefinementThreads)) { FLAG_SET_ERGO(uint, G1ConcRefinementThreads, ParallelGCThreads); } diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.cpp b/src/hotspot/share/gc/g1/g1BarrierSet.cpp index 8531d36e13c..0953e4b1836 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp @@ -120,6 +120,12 @@ void G1BarrierSet::invalidate(MemRegion mr) { volatile jbyte* byte = _card_table->byte_for(mr.start()); jbyte* last_byte = _card_table->byte_for(mr.last()); Thread* thr = Thread::current(); + if (G1BarrierSimple) { + for (; byte <= last_byte; byte++) { + *byte = G1CardTable::dirty_card_val(); + } + return; + } // skip all consecutive young cards for (; byte <= last_byte && *byte == G1CardTable::g1_young_card_val(); byte++); diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp index 8fc692fbb24..ee754b6f300 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp @@ -48,6 +48,10 @@ inline void G1BarrierSet::write_ref_field_pre(T* field) { template inline void G1BarrierSet::write_ref_field_post(T* field, oop new_val) { volatile jbyte* byte = _card_table->byte_for(field); + if (G1BarrierSimple) { + *byte = G1CardTable::dirty_card_val(); + return; + } if (*byte != G1CardTable::g1_young_card_val()) { // Take a slow path for cards in old write_ref_field_post_slow(byte); diff --git a/src/hotspot/share/gc/g1/g1CardTable.cpp b/src/hotspot/share/gc/g1/g1CardTable.cpp index b66ddd28a5b..b2fc47aff44 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.cpp +++ b/src/hotspot/share/gc/g1/g1CardTable.cpp @@ -53,6 +53,8 @@ bool G1CardTable::mark_card_deferred(size_t card_index) { } void 
G1CardTable::g1_mark_as_young(const MemRegion& mr) { + assert(!G1BarrierSimple, "Should not be called with G1BarrierSimple"); + jbyte *const first = byte_for(mr.start()); jbyte *const last = byte_after(mr.last()); @@ -61,7 +63,9 @@ void G1CardTable::g1_mark_as_young(const MemRegion& mr) { #ifndef PRODUCT void G1CardTable::verify_g1_young_region(MemRegion mr) { - verify_region(mr, g1_young_gen, true); + if (!G1BarrierSimple) { + verify_region(mr, g1_young_gen, true); + } } #endif @@ -97,6 +101,9 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) { } bool G1CardTable::is_in_young(oop obj) const { + if (G1BarrierSimple) { + return G1CollectedHeap::heap()->heap_region_containing(obj)->is_young(); + } volatile jbyte* p = byte_for(obj); return *p == G1CardTable::g1_young_card_val(); } diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp index 8b221ef1ce2..2d1133370d0 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp @@ -2181,6 +2181,12 @@ void G1CollectedHeap::heap_region_par_iterate_from_start(HeapRegionClosure* cl, _hrm.par_iterate(cl, hrclaimer, 0); } +void G1CollectedHeap::heap_region_par_iterate_chunk_based(HeapRegionClosure* cl, + HeapRegionChunkClaimer* chunk_claimer, + uint worker_id) const { + _hrm.par_iterate(cl, chunk_claimer, chunk_claimer->offset_for_worker(worker_id)); +} + void G1CollectedHeap::collection_set_iterate(HeapRegionClosure* cl) { _collection_set.iterate(cl); } @@ -2671,7 +2677,9 @@ class RegisterHumongousWithInCSetFastTestClosure : public HeapRegionClosure { if (g1h->is_in_closed_subset(ct->addr_for(card_ptr))) { if (*card_ptr != G1CardTable::dirty_card_val()) { *card_ptr = G1CardTable::dirty_card_val(); - _dcq.enqueue(card_ptr); + if (!G1BarrierSimple) { + _dcq.enqueue(card_ptr); + } } } } @@ -3188,6 +3196,7 @@ class G1ParTask : public AbstractGangTask { G1RootProcessor* _root_processor; TaskTerminator _terminator; uint _n_workers; + HeapRegionChunkClaimer _chunk_claimer; public: G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers) @@ -3197,7 +3206,8 @@ class G1ParTask : public AbstractGangTask { _queues(task_queues), _root_processor(root_processor), _terminator(n_workers, _queues), - _n_workers(n_workers) + _n_workers(n_workers), + _chunk_claimer(n_workers) {} void work(uint worker_id) { @@ -3223,7 +3233,7 @@ class G1ParTask : public AbstractGangTask { // treating the nmethods visited to act as roots for concurrent marking. // We only want to make sure that the oops in the nmethods are adjusted with regard to the // objects copied by the current evacuation. 
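The runtime (C++) side of the post barrier changed in g1BarrierSet.inline.hpp above follows the same shape. A small standalone model contrasting the baseline filter-and-enqueue barrier with the G1BarrierSimple one; the card values and the queue type are illustrative, not the HotSpot classes:

    #include <cstdint>
    #include <vector>

    struct CardTableModel {
      uint8_t* byte_map_base;   // biased base, as in CardTable
      int      card_shift;
      uint8_t* byte_for(const void* p) const {
        return byte_map_base + (reinterpret_cast<uintptr_t>(p) >> card_shift);
      }
    };

    static const uint8_t kDirty = 0;   // dirty_card_val()
    static const uint8_t kYoung = 2;   // stand-in for g1_young_card_val()

    // Baseline: cards of young regions are pre-marked and filtered out; anything
    // else is dirtied and enqueued for concurrent refinement (the slow path).
    inline void post_barrier_baseline(CardTableModel& ct, std::vector<uint8_t*>& dcq,
                                      const void* field) {
      uint8_t* byte = ct.byte_for(field);
      if (*byte != kYoung) {        // the check G1BarrierSimple removes
        *byte = kDirty;
        dcq.push_back(byte);        // drained later by refinement threads
      }
    }

    // G1BarrierSimple: no young-card filtering and no queue; the pause finds the
    // dirty cards by scanning the card table directly (see g1RemSet.cpp below).
    inline void post_barrier_simple(CardTableModel& ct, const void* field) {
      *ct.byte_for(field) = kDirty;
    }

This is also why g1_mark_as_young() and dirty_young_block() become no-ops under the flag: there is no point pre-marking young cards when nothing filters on them.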
- _g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id); + _g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id, &_chunk_claimer); double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec; @@ -3718,8 +3728,12 @@ void G1CollectedHeap::redirty_logged_cards() { dirty_card_queue_set().reset_for_par_iteration(); workers()->run_task(&redirty_task); - DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set(); - dcq.merge_bufferlists(&dirty_card_queue_set()); + if (G1BarrierSimple) { + dirty_card_queue_set().clear(); + } else { + DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set(); + dcq.merge_bufferlists(&dirty_card_queue_set()); + } assert(dirty_card_queue_set().completed_buffers_num() == 0, "All should be consumed"); g1_policy()->phase_times()->record_redirty_logged_cards_time_ms((os::elapsedTime() - redirty_logged_cards_start) * 1000.0); diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp index 34d0f429490..d2e86e8628f 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp @@ -148,6 +148,7 @@ class G1CollectedHeap : public CollectedHeap { // Other related classes. friend class HeapRegionClaimer; + friend class HeapRegionChunkClaimer; // Testing classes. friend class G1CheckCSetFastTableClosure; @@ -1167,6 +1168,10 @@ class G1CollectedHeap : public CollectedHeap { void heap_region_par_iterate_from_start(HeapRegionClosure* cl, HeapRegionClaimer* hrclaimer) const; + void heap_region_par_iterate_chunk_based(HeapRegionClosure* cl, + HeapRegionChunkClaimer* chunk_claimer, + uint worker_id) const; + // Iterate over the regions (if any) in the current collection set. void collection_set_iterate(HeapRegionClosure* blk); diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp index bbb3e9121f2..3bbdbe6ae28 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp @@ -112,6 +112,11 @@ inline void G1CollectedHeap::dirty_young_block(HeapWord* start, size_t word_size) { assert_heap_not_locked(); + if (G1BarrierSimple) { + // Not necessary to mark card young + return; + } + // Assign the containing region to containing_hr so that we don't // have to keep calling heap_region_containing() in the // asserts below. diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp index 85f76aabd2a..8fca8e2d20c 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp @@ -51,6 +51,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1), _stack_trim_lower_threshold(GCDrainStackTargetSize), _trim_ticks(), + _pending_cards(0), _old_gen_is_full(false) { // we allocate G1YoungSurvRateNumRegions plus one entries, since @@ -81,7 +82,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, } // Pass locally gathered statistics to global state. -size_t G1ParScanThreadState::flush(size_t* surviving_young_words) { +size_t G1ParScanThreadState::flush(size_t* surviving_young_words, size_t& pending_cards) { _dcq.flush(); // Update allocation statistics. 
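The new flush(size_t*, size_t&) signature above threads a per-worker pending-card count up to the policy. A minimal sketch of that aggregation pattern, with illustrative names rather than the HotSpot ones:

    #include <cstddef>
    #include <vector>

    struct WorkerStatsModel {
      size_t pending_cards;

      WorkerStatsModel() : pending_cards(0) {}

      void record_pending_cards(size_t n) { pending_cards = n; }

      // Mirrors the size_t& out-parameter added to flush(): accumulation happens
      // in the single-threaded flush loop, so no synchronization is needed.
      void flush(size_t& total_pending_cards) { total_pending_cards += pending_cards; }
    };

    inline size_t collect_pending_cards(std::vector<WorkerStatsModel>& workers) {
      size_t pending = 0;
      for (size_t i = 0; i < workers.size(); i++) {
        workers[i].flush(pending);
      }
      return pending;   // cf. g1_policy()->record_pending_cards(pending_cards)
    }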
_plab_allocator->flush_and_retire_stats(); @@ -95,6 +96,8 @@ size_t G1ParScanThreadState::flush(size_t* surviving_young_words) { surviving_young_words[region_index] += G1ParScanThreadState::surviving_young_words()[region_index]; sum += G1ParScanThreadState::surviving_young_words()[region_index]; } + assert(G1BarrierSimple || _pending_cards == 0, "Sanity"); + pending_cards += _pending_cards; return sum; } @@ -351,6 +354,7 @@ const size_t* G1ParScanThreadStateSet::surviving_young_words() const { void G1ParScanThreadStateSet::flush() { assert(!_flushed, "thread local state from the per thread states should be flushed once"); + size_t pending_cards = 0; size_t copied_words = 0; for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) { G1ParScanThreadState* pss = _states[worker_index]; @@ -359,10 +363,12 @@ void G1ParScanThreadStateSet::flush() { continue; } - copied_words += pss->flush(_surviving_young_words_total); + copied_words += pss->flush(_surviving_young_words_total, pending_cards); delete pss; _states[worker_index] = NULL; } + assert(G1BarrierSimple || pending_cards == 0, "Only available for G1BarrierSimple"); + _g1h->g1_policy()->record_pending_cards(pending_cards); _g1h->g1_policy()->record_copied_bytes(copied_words << LogBytesPerWord); _flushed = true; } diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp index 6a80461928b..e49ad2483e3 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp @@ -70,6 +70,8 @@ class G1ParScanThreadState : public CHeapObj { // this points into the array, as we use the first few entries for padding size_t* _surviving_young_words; + size_t _pending_cards; + // Indicates whether in the last generation (old) there is no more space // available for allocation. bool _old_gen_is_full; @@ -137,7 +139,11 @@ class G1ParScanThreadState : public CHeapObj { return _surviving_young_words + 1; } - size_t flush(size_t* surviving_young_words); + void record_pending_cards(size_t pending_cards) { + _pending_cards = pending_cards; + } + + size_t flush(size_t* surviving_young_words, size_t& pending_cards); private: #define G1_PARTIAL_ARRAY_MASK 0x2 diff --git a/src/hotspot/share/gc/g1/g1Policy.hpp b/src/hotspot/share/gc/g1/g1Policy.hpp index a057db9a19f..5a18b9eb047 100644 --- a/src/hotspot/share/gc/g1/g1Policy.hpp +++ b/src/hotspot/share/gc/g1/g1Policy.hpp @@ -135,6 +135,10 @@ class G1Policy: public CHeapObj { _max_rs_lengths = rs_lengths; } + void record_pending_cards(size_t pending_cards) { + _pending_cards = pending_cards; + } + void record_copied_bytes(size_t copied_bytes) { _copied_bytes = copied_bytes; } diff --git a/src/hotspot/share/gc/g1/g1RemSet.cpp b/src/hotspot/share/gc/g1/g1RemSet.cpp index fb210d67ead..235cf2af7a8 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.cpp +++ b/src/hotspot/share/gc/g1/g1RemSet.cpp @@ -482,7 +482,188 @@ class G1RefineCardClosure: public CardTableEntryClosure { size_t cards_skipped() const { return _cards_skipped; } }; -void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i) { +class G1ScanDirtyCardsHRClosure : public HeapRegionClosure { + G1CollectedHeap* _g1h; + G1CardTable* _ct; + G1RemSetScanState* _scan_state; + HeapRegionChunkClaimer* _chunk_claimer; + G1ScanObjsDuringUpdateRSClosure* _update_rs_cl; + + HeapRegion* _current_hr; + HeapWord* _scan_top; + + // This data structure stores consecutive dirty cards. [cards_start, cards_end) are dirty. 
+ struct DirtyCardRange { + jbyte* cards_start; + jbyte* cards_end; + DirtyCardRange() : cards_start(NULL), cards_end(NULL) {} + bool is_empty() const { + return cards_start == NULL; + } + size_t len() const { + assert(cards_start < cards_end, "Sanity"); + return cards_end - cards_start; + } + void add(jbyte* card) { + if (cards_start == NULL) { + assert(cards_end == NULL, "Sanity"); + cards_start = card; + cards_end = card + 1; + } else { + assert(cards_end != NULL, "Sanity"); + assert(card == cards_start - 1, "Must decrement"); + cards_start = card; + } + } + void clear() { + cards_start = NULL; + cards_end = NULL; + } + }; + DirtyCardRange _dirties; + + size_t _dirty_card_num; + + void process_consecutive() { + assert(_current_hr != NULL, "Sanity"); + assert(_scan_top == _scan_state->scan_top(_current_hr->hrm_index()), "Sanity"); + assert(!_dirties.is_empty(), "Sanity"); + _scan_state->add_dirty_region(_current_hr->hrm_index()); + size_t len = _dirties.len(); + + HeapWord* mr_addr = _ct->addr_for(_dirties.cards_start); + HeapWord* mr_end = mr_addr + len * G1CardTable::card_size_in_words; + MemRegion mr(mr_addr, MIN2(mr_end, _scan_top)); + + bool res = _current_hr->oops_on_card_seq_iterate_careful(mr, _update_rs_cl); + assert(res, "Must have iterated objects"); + _update_rs_cl->trim_queue_partially(); + + _dirty_card_num += len; + _dirties.clear(); + } + + // Walk the card table by words (multiple of cards). + jbyte* walk_through_words(jbyte* processed_card, jbyte* const limit) { + if (is_aligned(processed_card, BytesPerWord)) { + assert(is_aligned(limit, BytesPerWord), "Sanity"); + // This potentially reads a few bytes over the top, which costs only a little extra work, + // so we don't check. + while (processed_card >= limit + BytesPerWord && + *(intptr_t*)(processed_card - BytesPerWord) == CardTableRS::clean_card_row_val()) { + // The whole word under processed_card is clean. We skip it. + + processed_card -= BytesPerWord; + } + } + return processed_card; + } + + void scan_chunk(HeapWord* chunk_start, HeapWord* chunk_end) { + jbyte* const scan_start = _ct->byte_for(chunk_start); + jbyte* const scan_end = _ct->byte_for(chunk_end - 1) + 1; // We don't do this card + jbyte* processed_card = scan_end; + assert(_dirties.is_empty(), "Sanity"); + + // Backward scan of cards in [scan_start, scan_end). + assert(is_aligned(scan_start, BytesPerWord), "Chunk must be aligned"); + while (processed_card > scan_start) { + if (processed_card >= scan_start + BytesPerWord) { + processed_card = walk_through_words(processed_card, scan_start); + if (processed_card <= scan_start) { + break; + } + } + + // Check the card under processed_card. + jbyte* curr = processed_card - 1; + jbyte card_val = *curr; + if (card_val == G1CardTable::dirty_card_val()) { + if (!_dirties.is_empty() && curr != _dirties.cards_start - 1) { + // We are adding a dirty card that is not connected to the previous ones. + process_consecutive(); + } + // Remember it. Process later. + _dirties.add(curr); + + *curr = G1CardTable::claimed_card_val(); + } else { + // It's claimed or deferred because of scan_rem_set(). We ignore them. 
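The DirtyCardRange helper above batches runs of consecutive dirty cards found by the backward scan, so each run is handed to oops_on_card_seq_iterate_careful() once instead of card by card. A standalone model of that batching (card values are illustrative), followed by a small usage example:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static const uint8_t kDirty   = 0;
    static const uint8_t kClean   = 0xff;
    static const uint8_t kClaimed = 0x01;   // stand-in for claimed_card_val()

    // Scan cards backward, group consecutive dirty cards into one [first, first+num)
    // range, and hand each range to 'process'. Claimed cards are not rescanned.
    template <typename ProcessRange>   // void(size_t first_card, size_t num_cards)
    void scan_cards_backward(uint8_t* cards, size_t n, ProcessRange process) {
      size_t range_start = n, range_end = n;            // empty range
      for (size_t i = n; i-- > 0; ) {
        if (cards[i] == kDirty) {
          if (range_start != n && i + 1 != range_start) {
            process(range_start, range_end - range_start);   // flush disjoint batch
            range_end = i + 1;
          } else if (range_start == n) {
            range_end = i + 1;                               // first dirty card seen
          }
          range_start = i;
          cards[i] = kClaimed;                               // no other worker rescans it
        }
      }
      if (range_start != n) {
        process(range_start, range_end - range_start);
      }
    }

    int main() {
      uint8_t ct[8] = { kClean, kDirty, kDirty, kClean, kDirty, kDirty, kDirty, kClean };
      scan_cards_backward(ct, 8, [](size_t first, size_t num) {
        std::printf("dirty range: cards [%zu, %zu)\n", first, first + num);
      });
      return 0;
    }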
+ assert(card_val == G1CardTable::clean_card_val() || + card_val & G1CardTable::claimed_card_val() || + card_val & G1CardTable::deferred_card_val(), + "If it's not dirty, it must be clean, claimed, or deferred");; + } + + --processed_card; + } + + if (!_dirties.is_empty()) { + process_consecutive(); + } + } + +public: + G1ScanDirtyCardsHRClosure(G1RemSetScanState* scan_state, + HeapRegionChunkClaimer* chunk_claimer, + G1ScanObjsDuringUpdateRSClosure* update_rs_cl) : + _g1h(G1CollectedHeap::heap()), + _ct(_g1h->card_table()), + _scan_state(scan_state), + _chunk_claimer(chunk_claimer), + _update_rs_cl(update_rs_cl), + _current_hr(NULL), + _scan_top(NULL), + _dirties(), + _dirty_card_num(0) {} + + bool do_heap_region(HeapRegion* r) { + uint hrm_index = r->hrm_index(); + HeapWord* scan_top = _scan_state->scan_top(hrm_index); + if (scan_top == NULL) { + // Either empty or in collection set. + _chunk_claimer->claim_region(hrm_index); + return false; + } + _update_rs_cl->set_region(r); + _current_hr = r; + assert(scan_top > r->bottom(), "Must be non-empty"); + _scan_top = scan_top; + + const uint chunks_per_region = HeapRegionChunkClaimer::ChunksPerRegion; + assert(HeapRegion::GrainWords % chunks_per_region == 0, "Chunk size must be multiple of word"); + const size_t words_per_chunk = HeapRegion::GrainWords / chunks_per_region; + // Claim chunks backwards. Block offset table has an affinity for backward iteration. + uint chunk = chunks_per_region; + do { + --chunk; + HeapWord* chunk_start = r->bottom() + chunk * words_per_chunk; + // We have to claim this chunk even it's over scan top, so that other workers + // can quickly skip this region by checking is_region_claimed(). + bool res = _chunk_claimer->claim_chunk(_chunk_claimer->get_chunk_index(hrm_index, chunk)); + if (res == false) { + // Failed to claim this chunk. Another thread must have succeeded. + continue; + } + if (chunk_start >= scan_top) { + // Nothing to scan. + continue; + } + HeapWord* chunk_end = chunk_start + words_per_chunk; + scan_chunk(chunk_start, MIN2(chunk_end, scan_top)); + } while (chunk > 0); + + _current_hr = NULL; + _scan_top = NULL; + return false; + } + + size_t dirty_card_num() const { + return _dirty_card_num; + } +}; + +void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i, HeapRegionChunkClaimer* chunk_claimer) { G1GCPhaseTimes* p = _g1p->phase_times(); // Apply closure to log entries in the HCC. 
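do_heap_region() above splits each region into fixed-size chunks, claims them backward with a CAS, and claims even chunks at or above scan_top so that is_region_claimed() lets other workers skip the whole region quickly. A toy model of that loop, assuming one claim byte per chunk:

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    static const unsigned kChunksPerRegion = 8;   // sizeof(uint64_t) claim bytes

    struct RegionModel {
      uintptr_t bottom;         // first word of the region
      uintptr_t scan_top;       // allocation frontier at the start of the pause
      uintptr_t grain_words;    // region size in words
    };

    inline bool claim_chunk(std::atomic<uint8_t>* claims, size_t chunk_index) {
      uint8_t expected = 0;
      return claims[chunk_index].compare_exchange_strong(expected, 0xff,
                                                         std::memory_order_relaxed);
    }

    template <typename ScanFn>  // void(uintptr_t chunk_start_word, uintptr_t chunk_end_word)
    void scan_region_chunks(const RegionModel& r, std::atomic<uint8_t>* claims,
                            size_t first_chunk_index, ScanFn scan) {
      const uintptr_t words_per_chunk = r.grain_words / kChunksPerRegion;
      for (unsigned chunk = kChunksPerRegion; chunk-- > 0; ) {   // backward, as above
        uintptr_t chunk_start = r.bottom + chunk * words_per_chunk;
        if (!claim_chunk(claims, first_chunk_index + chunk)) {
          continue;   // another worker owns this chunk
        }
        if (chunk_start >= r.scan_top) {
          continue;   // claimed so is_region_claimed() can succeed, but nothing to scan
        }
        uintptr_t chunk_end = chunk_start + words_per_chunk;
        scan(chunk_start, chunk_end < r.scan_top ? chunk_end : r.scan_top);
      }
    }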
@@ -499,11 +680,18 @@ void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i) { G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::UpdateRS, worker_i); G1ScanObjsDuringUpdateRSClosure update_rs_cl(_g1h, pss, worker_i); - G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl); - _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i); + if (G1BarrierSimple) { + G1ScanDirtyCardsHRClosure cl(_scan_state, chunk_claimer, &update_rs_cl); + _g1h->heap_region_par_iterate_chunk_based(&cl, chunk_claimer, worker_i); + pss->record_pending_cards(cl.dirty_card_num()); + p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, cl.dirty_card_num(), G1GCPhaseTimes::UpdateRSScannedCards); + } else { + G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl); + _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i); - p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards); - p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards); + p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards); + p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards); + } } } @@ -511,9 +699,9 @@ void G1RemSet::cleanupHRRS() { HeapRegionRemSet::cleanup(); } -void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i) { - update_rem_set(pss, worker_i); - scan_rem_set(pss, worker_i);; +void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i, HeapRegionChunkClaimer* chunk_claimer) { + update_rem_set(pss, worker_i, chunk_claimer); + scan_rem_set(pss, worker_i); } void G1RemSet::prepare_for_oops_into_collection_set_do() { @@ -547,6 +735,7 @@ inline void check_card_ptr(jbyte* card_ptr, G1CardTable* ct) { void G1RemSet::refine_card_concurrently(jbyte* card_ptr, uint worker_i) { assert(!_g1h->is_gc_active(), "Only call concurrently"); + assert(!G1BarrierSimple, "Concurrent Refinement should be disabled"); // Construct the region representing the card. HeapWord* start = _ct->addr_for(card_ptr); diff --git a/src/hotspot/share/gc/g1/g1RemSet.hpp b/src/hotspot/share/gc/g1/g1RemSet.hpp index 36b9a54e12d..995df2c0015 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.hpp +++ b/src/hotspot/share/gc/g1/g1RemSet.hpp @@ -50,6 +50,7 @@ class G1Policy; class G1ScanObjsDuringScanRSClosure; class G1ScanObjsDuringUpdateRSClosure; class HeapRegionClaimer; +class HeapRegionChunkClaimer; // A G1RemSet in which each heap region has a rem set that records the // external heap references into it. Uses a mod ref bs to track updates, @@ -66,7 +67,7 @@ class G1RemSet: public CHeapObj { // Flush remaining refinement buffers for cross-region references to either evacuate references // into the collection set or update the remembered set. - void update_rem_set(G1ParScanThreadState* pss, uint worker_i); + void update_rem_set(G1ParScanThreadState* pss, uint worker_i, HeapRegionChunkClaimer* claimer); G1CollectedHeap* _g1h; size_t _num_conc_refined_cards; // Number of cards refined concurrently to the mutator. @@ -101,7 +102,7 @@ class G1RemSet: public CHeapObj { // // Further applies heap_region_codeblobs on the oops of the unmarked nmethods on the strong code // roots list for each region in the collection set. 
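The chunk scan relies on walk_through_words(), defined earlier in this file, to skip whole words of clean cards at a time. A standalone model of that optimization, assuming the clean card value is all bits set as in HotSpot's card table:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    static const uint8_t kCleanCard    = 0xff;
    static const size_t  kBytesPerWord = sizeof(uintptr_t);

    inline bool word_is_all_clean(const uint8_t* p) {
      uintptr_t word;
      std::memcpy(&word, p, sizeof(word));           // avoids strict-aliasing issues
      return word == ~static_cast<uintptr_t>(0);     // every byte is 0xff (clean)
    }

    // Move 'cursor' (exclusive upper bound of the unscanned range) down over
    // whole words of clean cards; never moves below 'limit'.
    inline const uint8_t* skip_clean_words(const uint8_t* cursor, const uint8_t* limit) {
      if (reinterpret_cast<uintptr_t>(cursor) % kBytesPerWord != 0) {
        return cursor;                               // only worthwhile when word-aligned
      }
      while (cursor >= limit + kBytesPerWord &&
             word_is_all_clean(cursor - kBytesPerWord)) {
        cursor -= kBytesPerWord;
      }
      return cursor;
    }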
- void oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i); + void oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i, HeapRegionChunkClaimer* claimer); // Prepare for and cleanup after an oops_into_collection_set_do // call. Must call each of these once before and after (in sequential diff --git a/src/hotspot/share/gc/g1/heapRegionManager.cpp b/src/hotspot/share/gc/g1/heapRegionManager.cpp index 8fa5baf37a4..f77619a041d 100644 --- a/src/hotspot/share/gc/g1/heapRegionManager.cpp +++ b/src/hotspot/share/gc/g1/heapRegionManager.cpp @@ -361,6 +361,41 @@ void HeapRegionManager::par_iterate(HeapRegionClosure* blk, HeapRegionClaimer* h } } +void HeapRegionManager::par_iterate(HeapRegionClosure* blk, + HeapRegionChunkClaimer* chunk_claimer, + const uint start_index) const { + const uint n_regions = chunk_claimer->n_regions(); + for (uint count = 0; count < n_regions; count++) { + const uint index = (start_index + count) % n_regions; + assert(index < n_regions, "Sanity"); + if (!is_available(index)) { + continue; + } + HeapRegion* r = _regions.get_by_index(index); + assert(r->hrm_index() == index, "Sanity"); + if (chunk_claimer->is_region_claimed(index)) { + continue; + } + + // We don't directly claim the chunk or the region, in case the user want different flavors + // of iteration, e.g., forward/backward iteration. + // Instead, we let the hr closure to process the region and trust it will claim the chunks. + // If it processed a region without claiming any chunk, an assertion will fire. +#ifndef PRODUCT + HeapRegionChunkClaimer::RegionClaimState state = chunk_claimer->region_state(index); +#endif + bool res = blk->do_heap_region(r); +#ifndef PRODUCT + HeapRegionChunkClaimer::RegionClaimState new_state = chunk_claimer->region_state(index); + assert(state == HeapRegionChunkClaimer::RegionClaimed || new_state != state, + "Should change region claim state"); +#endif + if (res) { + return; + } + } +} + uint HeapRegionManager::shrink_by(uint num_regions_to_remove) { assert(length() > 0, "the region sequence should not be empty"); assert(length() <= _allocated_heapregions_length, "invariant"); @@ -507,3 +542,41 @@ bool HeapRegionClaimer::claim_region(uint region_index) { uint old_val = Atomic::cmpxchg(Claimed, &_claims[region_index], Unclaimed); return old_val == Unclaimed; } + +HeapRegionChunkClaimer::HeapRegionChunkClaimer(uint n_workers) : + _n_workers(n_workers), + _n_regions(G1CollectedHeap::heap()->_hrm._allocated_heapregions_length), + _n_chunks(_n_regions * ChunksPerRegion), + _claims(NULL) { + assert(n_workers > 0, "Need at least one worker"); + assert(HeapRegion::GrainWords % ChunksPerRegion == 0, "Chunk size must be multiple of word"); + jbyte* new_claims = NEW_C_HEAP_ARRAY(jbyte, _n_chunks, mtGC); + memset(new_claims, Unclaimed, sizeof(*_claims) * _n_chunks); + _claims = new_claims; +} + +HeapRegionChunkClaimer::~HeapRegionChunkClaimer() { + assert(_claims != NULL, "Must be initialized"); + FREE_C_HEAP_ARRAY(jbyte, _claims); +} + +uint HeapRegionChunkClaimer::offset_for_worker(uint worker_id) const { + assert(worker_id < _n_workers, "Invalid worker id"); + return _n_regions * worker_id / _n_workers; +} + +bool HeapRegionChunkClaimer::claim_chunk(uint chunk_index) { + assert(chunk_index < _n_chunks, "Invalid chunk index"); + uint old_val = Atomic::cmpxchg(Claimed, &_claims[chunk_index], Unclaimed, + memory_order_relaxed); + return old_val == Unclaimed; +} + +bool HeapRegionChunkClaimer::claim_region(uint region_index) { + assert(region_index < 
_n_regions, "Invalid region index"); + // This only works for a completely unclaimed region. If any part of the region is claimed + // by another thread, we cannot claim the whole region. + uint old_val = Atomic::cmpxchg(RegionClaimed, ((RegionClaimState*)_claims) + region_index, + RegionUnclaimed, memory_order_relaxed); + return old_val == RegionUnclaimed; +} diff --git a/src/hotspot/share/gc/g1/heapRegionManager.hpp b/src/hotspot/share/gc/g1/heapRegionManager.hpp index 385d837942b..e5ebd71f9a1 100644 --- a/src/hotspot/share/gc/g1/heapRegionManager.hpp +++ b/src/hotspot/share/gc/g1/heapRegionManager.hpp @@ -33,6 +33,7 @@ class HeapRegion; class HeapRegionClosure; class HeapRegionClaimer; +class HeapRegionChunkClaimer; class FreeRegionList; class WorkGang; @@ -70,6 +71,7 @@ class G1HeapRegionTable : public G1BiasedMappedArray { class HeapRegionManager: public CHeapObj { friend class VMStructs; friend class HeapRegionClaimer; + friend class HeapRegionChunkClaimer; G1HeapRegionTable _regions; @@ -246,6 +248,10 @@ class HeapRegionManager: public CHeapObj { void par_iterate(HeapRegionClosure* blk, HeapRegionClaimer* hrclaimer, const uint start_index) const; + void par_iterate(HeapRegionClosure* blk, + HeapRegionChunkClaimer* chunk_claimer, + const uint start_index) const; + // Uncommit up to num_regions_to_remove regions that are completely free. // Return the actual number of uncommitted regions. uint shrink_by(uint num_regions_to_remove); @@ -287,4 +293,58 @@ class HeapRegionClaimer : public StackObj { // Claim the given region, returns true if successfully claimed. bool claim_region(uint region_index); }; + +// Split a region into chunks for better parallelization granularity. +class HeapRegionChunkClaimer { + friend class HeapRegionManager; + + uint _n_workers; + uint _n_regions; + uint _n_chunks; + volatile jbyte* _claims; + + static const jbyte Unclaimed = 0; + static const jbyte Claimed = 0xff; + + typedef uint64_t RegionClaimState; + + static const RegionClaimState RegionUnclaimed = 0; + static const RegionClaimState RegionClaimed = (RegionClaimState)(-1); + + public: + static const uint ChunksPerRegion = (uint)(sizeof(RegionClaimState) / sizeof(jbyte)); + + HeapRegionChunkClaimer(uint n_workers); + ~HeapRegionChunkClaimer(); + + uint n_regions() const { return _n_regions; } + + uint get_chunk_index(uint region_index, uint chunk) const { + assert(region_index < _n_regions, "Invalid region index"); + assert(chunk < ChunksPerRegion, "Sanity"); + return region_index * ChunksPerRegion + chunk; + } + + uint offset_for_worker(uint worker_id) const; + + RegionClaimState region_state(uint region_index) const { + assert(region_index < _n_regions, "Invalid region index"); + volatile RegionClaimState* state = ((volatile RegionClaimState*)_claims) + region_index; + return *state; + } + + bool is_chunk_claimed(uint chunk_index) const { + assert(chunk_index < _n_chunks, "Invalid chunk index"); + volatile jbyte* state = _claims + chunk_index; + return *state == Claimed; + } + + bool is_region_claimed(uint region_index) const { + return region_state(region_index) == RegionClaimed; + } + + bool claim_chunk(uint chunk_index); + bool claim_region(uint region_index); +}; + #endif // SHARE_VM_GC_G1_HEAPREGIONMANAGER_HPP diff --git a/src/hotspot/share/runtime/globals_ext.hpp b/src/hotspot/share/runtime/globals_ext.hpp index 56efe6cfe77..642e932dfac 100644 --- a/src/hotspot/share/runtime/globals_ext.hpp +++ b/src/hotspot/share/runtime/globals_ext.hpp @@ -87,6 +87,9 @@ product(bool, 
AppCDSVerifyClassPathOrder, true, \ "Verify classpath order between the dump phase and replay phase") \ \ + product(bool, G1BarrierSimple, false, \ + "Use simple G1 post barrier") \ + \ //add new AJDK specific flags here diff --git a/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java b/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java index 083f0d7fc09..ce5a6750b4c 100644 --- a/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java +++ b/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java @@ -28,6 +28,7 @@ * @key gc * @bug 8047976 * @requires vm.gc.G1 + * @requires vm.opt.final.G1BarrierSimple != true * @summary Tests argument processing for G1ConcRefinementThreads * @library /test/lib * @library / diff --git a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java index 2267e06ea55..9beee9eeeeb 100644 --- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java +++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java @@ -30,6 +30,7 @@ * includes the expected necessary messages. * @key gc * @requires vm.gc.G1 + * @requires vm.opt.final.G1BarrierSimple != true * @library /test/lib * @modules java.base/jdk.internal.misc * java.management
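For reference, HeapRegionChunkClaimer above encodes one claim byte per chunk, eight chunks per region, so a region's claim bytes can also be read and CAS'ed as a single 64-bit word. A standalone model of that encoding; names are illustrative, and the region-wide state is kept at full 64-bit width:

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    class ChunkClaimerModel {
    public:
      static const unsigned kChunksPerRegion = sizeof(uint64_t);

    private:
      static const uint64_t kRegionUnclaimed = 0;
      static const uint64_t kRegionClaimed   = ~static_cast<uint64_t>(0);
      std::vector<std::atomic<uint64_t>> _claims;   // one word of chunk bytes per region

    public:
      explicit ChunkClaimerModel(size_t n_regions) : _claims(n_regions) {
        for (auto& w : _claims) w.store(kRegionUnclaimed, std::memory_order_relaxed);
      }

      // Claim one chunk by setting its byte to 0xff within the region's word.
      bool claim_chunk(size_t region, unsigned chunk) {
        const uint64_t mask = static_cast<uint64_t>(0xff) << (8 * chunk);
        uint64_t old_word = _claims[region].load(std::memory_order_relaxed);
        while ((old_word & mask) == 0) {            // chunk still unclaimed
          if (_claims[region].compare_exchange_weak(old_word, old_word | mask,
                                                    std::memory_order_relaxed)) {
            return true;
          }
        }
        return false;                               // another worker got it first
      }

      // Claiming a whole region succeeds only if none of its chunks was claimed yet.
      bool claim_region(size_t region) {
        uint64_t expected = kRegionUnclaimed;
        return _claims[region].compare_exchange_strong(expected, kRegionClaimed,
                                                       std::memory_order_relaxed);
      }

      bool is_region_claimed(size_t region) const {
        return _claims[region].load(std::memory_order_relaxed) == kRegionClaimed;
      }
    };

A worker that sees is_region_claimed() return true can skip the region without inspecting its eight chunk flags individually, which is what the chunk-based par_iterate() above relies on.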