diff --git a/vmkit/src/main.rs b/vmkit/src/main.rs
index 8185fdb..8b3e3d0 100644
--- a/vmkit/src/main.rs
+++ b/vmkit/src/main.rs
@@ -1,32 +1,32 @@
-
+use mmtk::util::options::AffinityKind;
+use mmtk::util::Address;
 use mmtk::{util::options::PlanSelector, vm::slot::SimpleSlot, AllocationSemantics, MMTKBuilder};
-use std::mem::offset_of;
+use std::cell::RefCell;
 use std::sync::OnceLock;
+use std::sync::Arc;
+use vmkit::threading::parked_scope;
 use vmkit::{
     mm::{traits::Trace, MemoryManager},
     object_model::{
         metadata::{GCMetadata, TraceCallback},
         object::VMKitObject,
     },
+    sync::Monitor,
     threading::{GCBlockAdapter, Thread, ThreadContext},
     VMKit, VirtualMachine,
 };
 
-const CONSERVATIVE_TRACE_NODE: bool = false;
-
 #[repr(C)]
 struct Node {
-    left: VMKitObject,
-    right: VMKitObject,
-    i: usize,
-    j: usize,
+    left: NodeRef,
+    right: NodeRef,
 }
 
 static METADATA: GCMetadata<BenchVM> = GCMetadata {
     trace: TraceCallback::TraceObject(|object, tracer| unsafe {
         let node = object.as_address().as_mut_ref::<Node>();
-        node.left.trace_object(tracer);
-        node.right.trace_object(tracer);
+        node.left.0.trace_object(tracer);
+        node.right.0.trace_object(tracer);
     }),
     instance_size: size_of::<Node>(),
     compute_size: None,
@@ -104,136 +104,155 @@ impl VirtualMachine for BenchVM {
     }
 }
 
-fn make_node(
-    thread: &Thread<BenchVM>,
-    left: VMKitObject,
-    right: VMKitObject,
-    i: usize,
-    j: usize,
-) -> VMKitObject {
-    let node = MemoryManager::allocate(
-        thread,
-        size_of::<Node>(),
-        16,
-        &METADATA,
-        AllocationSemantics::Default,
-    );
-
-    unsafe {
-        node.set_field_object_no_write_barrier::<false>(offset_of!(Node, left), left);
-        node.set_field_object_no_write_barrier::<false>(offset_of!(Node, right), right);
-        node.set_field_usize::<false>(offset_of!(Node, i), i);
-        node.set_field_usize::<false>(offset_of!(Node, j), j);
-    }
-    node
-}
+#[repr(transparent)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+struct NodeRef(VMKitObject);
 
-fn tree_size(i: usize) -> usize {
-    (1 << (i + 1)) - 1
-}
-
-fn num_iters(stretch_tree_depth: usize, i: usize) -> usize {
-    4 + tree_size(stretch_tree_depth) / tree_size(i)
-}
-
-fn populate(thread: &Thread<BenchVM>, depth: usize, this_node: VMKitObject) {
-    let mut depth = depth;
-    if depth <= 0 {
-        return;
+impl NodeRef {
+    pub fn new(thread: &Thread<BenchVM>, left: NodeRef, right: NodeRef) -> Self {
+        let node = MemoryManager::<BenchVM>::allocate(
+            thread,
+            size_of::<Node>(),
+            16,
+            &METADATA,
+            AllocationSemantics::Default,
+        );
+        unsafe {
+            let node = node.as_address().as_mut_ref::<Node>();
+            node.left = left;
+            node.right = right;
+        }
+        Self(node)
     }
-    depth -= 1;
-    this_node.set_field_object::<false>(
-        offset_of!(Node, left),
-        make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0),
-    );
-    let left = this_node.get_field_object::<false>(offset_of!(Node, left));
-    this_node.set_field_object::<false>(
-        offset_of!(Node, right),
-        make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0),
-    );
-
-
-
-    populate(
-        thread,
-        depth,
-        this_node.get_field_object::<false>(offset_of!(Node, left)),
-    );
-    populate(
-        thread,
-        depth,
-        this_node.get_field_object::<false>(offset_of!(Node, right)),
-    );
-}
-
-fn make_tree(thread: &Thread<BenchVM>, depth: usize) -> VMKitObject {
-    if depth <= 0 {
-        return make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
+    pub fn left(&self) -> NodeRef {
+        unsafe {
+            let node = self.0.as_address().as_ref::<Node>();
+            node.left
+        }
     }
-    let left = make_tree(thread, depth - 1);
-    let right = make_tree(thread, depth - 1);
-    make_node(thread, left, right, 0, 0)
-}
-
-fn time_construction(thread: &Thread<BenchVM>, stretch_tree_depth: usize, depth: usize) {
-    let i_num_iters = num_iters(stretch_tree_depth, depth);
-    println!("creating {} trees of depth {}", i_num_iters, depth);
-    let start = std::time::Instant::now();
-
-    let mut i = 0;
-    while i < i_num_iters {
-        let temp_tree = make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
-        populate(thread, depth, temp_tree);
-        i += 1;
+    pub fn right(&self) -> NodeRef {
+        unsafe {
+            let node = self.0.as_address().as_ref::<Node>();
+            node.right
+        }
     }
-    let finish = std::time::Instant::now();
-    println!("\tTop down construction took: {:04}ms", finish.duration_since(start).as_micros() as f64 / 1000.0);
-
+    pub fn null() -> Self {
+        Self(VMKitObject::NULL)
+    }
 
-    let duration = start.elapsed();
-    println!("time_construction: {:?}", duration);
+    pub fn item_check(&self) -> usize {
+        if self.left() == NodeRef::null() {
+            1
+        } else {
+            1 + self.left().item_check() + self.right().item_check()
+        }
+    }
+
+    pub fn leaf(thread: &Thread<BenchVM>) -> Self {
+        Self::new(thread, NodeRef::null(), NodeRef::null())
+    }
 }
 
+fn bottom_up_tree(thread: &Thread<BenchVM>, depth: usize) -> NodeRef {
+    if thread.take_yieldpoint() != 0 {
+        Thread::<BenchVM>::yieldpoint(0, Address::ZERO);
+    }
+    if depth > 0 {
+        NodeRef::new(
+            thread,
+            bottom_up_tree(thread, depth - 1),
+            bottom_up_tree(thread, depth - 1),
+        )
+    } else {
+        NodeRef::leaf(thread)
+    }
+}
+
+const MIN_DEPTH: usize = 4;
+
 fn main() {
     env_logger::init();
-    let mut options = MMTKBuilder::new();
-    options.options.plan.set(PlanSelector::StickyImmix);
-    options.options.gc_trigger.set(mmtk::util::options::GCTriggerSelector::DynamicHeapSize(64*1024*1024, 8*1024*1024*1024));
-    let vm = BenchVM {
-        vmkit: VMKit::new(options)
-    };
-
-    VM.set(vm).unwrap_or_else(|_| panic!("Failed to set VM"));
+    let nthreads = std::env::var("THREADS")
+        .unwrap_or("4".to_string())
+        .parse::<usize>()
+        .unwrap();
+    let mut builder = MMTKBuilder::new();
+    builder.options.plan.set(PlanSelector::Immix);
+    builder.options.threads.set(nthreads);
+    builder.options.thread_affinity.set(AffinityKind::RoundRobin(vec![0, 1, 2, 3, 4, 5, 6, 7, 8]));
+    builder.options.gc_trigger.set(mmtk::util::options::GCTriggerSelector::DynamicHeapSize(1*1024*1024*1024, 3*1024*1024*1024));
+    VM.set(BenchVM {
+        vmkit: VMKit::new(builder),
+    })
+    .unwrap_or_else(|_| panic!());
 
     Thread::<BenchVM>::main(ThreadBenchContext, || {
-        let tls= Thread::<BenchVM>::current();
-
-        let depth = std::env::var("DEPTH").unwrap_or("18".to_string()).parse::<usize>().unwrap();
-        let long_lived_tree_depth = depth;
-
-        let stretch_tree_depth = depth + 1;
-
-        println!("stretching memory with tree of depth: {}", stretch_tree_depth);
+        let thread = Thread::<BenchVM>::current();
         let start = std::time::Instant::now();
-        make_tree(tls, stretch_tree_depth as _);
+        let n = std::env::var("DEPTH")
+            .unwrap_or("18".to_string())
+            .parse::<usize>()
+            .unwrap();
+        let max_depth = if n < MIN_DEPTH + 2 { MIN_DEPTH + 2 } else { n };
 
-        println!("creating long-lived tree of depth: {}", long_lived_tree_depth);
-        let long_lived_tree = make_node(tls, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
-        populate(tls, long_lived_tree_depth as _, long_lived_tree);
+        let stretch_depth = max_depth + 1;
+        println!("stretch tree of depth {stretch_depth}");
+
+        let _ = bottom_up_tree(&thread, stretch_depth);
+        let duration = start.elapsed();
+        println!("time: {duration:?}");
 
-        let mut d = 4;
+        let results = Arc::new(Monitor::new(vec![
+            RefCell::new(String::new());
+            (max_depth - MIN_DEPTH) / 2 + 1
+        ]));
 
-        while d <= depth {
-            time_construction(tls, stretch_tree_depth, d);
-            d += 2;
+        let mut handles = Vec::new();
+
+        for d in (MIN_DEPTH..=max_depth).step_by(2) {
+            let depth = d;
+
+            let thread = Thread::<BenchVM>::for_mutator(ThreadBenchContext);
+            let results = results.clone();
+            let handle = thread.start(move || {
+                let thread = Thread::<BenchVM>::current();
+                let mut check = 0;
+
+                let iterations = 1 << (max_depth - depth + MIN_DEPTH);
+                for _ in 1..=iterations {
+                    if thread.take_yieldpoint() != 0 {
+                        Thread::<BenchVM>::yieldpoint(0, Address::ZERO);
+                    }
+                    let tree_node = bottom_up_tree(&thread, depth);
+                    check += tree_node.item_check();
+                }
+
+                *results.lock_with_handshake::<BenchVM>()[(depth - MIN_DEPTH) / 2].borrow_mut() =
+                    format!("{iterations}\t trees of depth {depth}\t check: {check}");
+            });
+            handles.push(handle);
         }
 
-        let finish = std::time::Instant::now();
-        println!("total execution time: {:04}ms", finish.duration_since(start).as_micros() as f64 / 1000.0);
-
+        parked_scope::<(), BenchVM>(|| {
+            while let Some(handle) = handles.pop() {
+                handle.join().unwrap();
+            }
+        });
+
+        for result in results.lock_with_handshake::<BenchVM>().iter() {
+            println!("{}", result.borrow());
+        }
+
+        println!(
+            "long lived tree of depth {max_depth}\t check: {}",
+            bottom_up_tree(&thread, max_depth).item_check()
+        );
+
+        let duration = start.elapsed();
+        println!("time: {duration:?}");
     });
-}
\ No newline at end of file
+}
diff --git a/vmkit/src/mm.rs b/vmkit/src/mm.rs
index 419ca99..1d45907 100644
--- a/vmkit/src/mm.rs
+++ b/vmkit/src/mm.rs
@@ -10,7 +10,7 @@ use crate::{
 use easy_bitfield::{AtomicBitfieldContainer, ToBitfield};
 use mmtk::{
     util::{
-        alloc::{AllocatorSelector, BumpAllocator, ImmixAllocator},
+        alloc::{AllocatorSelector, BumpAllocator, FreeListAllocator, ImmixAllocator},
         metadata::side_metadata::GLOBAL_SIDE_METADATA_BASE_ADDRESS,
         VMMutatorThread,
     },
@@ -65,6 +65,49 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         )
     }
 
+    /// General purpose allocation function. Always goes to `mmtk::memory_manager::alloc`
+    /// and does not attempt to perform fast-path allocation. This is useful for debugging
+    /// or when your JIT/AOT compiler is not yet able to produce fast-path allocation.
+    #[inline(never)]
+    pub extern "C-unwind" fn allocate_out_of_line(
+        thread: &Thread<VM>,
+        mut size: usize,
+        alignment: usize,
+        metadata: VM::Metadata,
+        mut semantics: AllocationSemantics,
+    ) -> VMKitObject {
+        size += size_of::<HeapObjectHeader<VM>>();
+        if semantics == AllocationSemantics::Default
+            && size >= thread.max_non_los_default_alloc_bytes()
+        {
+            semantics = AllocationSemantics::Los;
+        }
+
+        match semantics {
+            AllocationSemantics::Los => Self::allocate_los(thread, size, alignment, metadata),
+            AllocationSemantics::NonMoving => {
+                Self::allocate_nonmoving(thread, size, alignment, metadata)
+            }
+            AllocationSemantics::Immortal => {
+                Self::allocate_immortal(thread, size, alignment, metadata)
+            }
+            _ => unsafe {
+                Self::flush_tlab(thread);
+                let object_start =
+                    mmtk::memory_manager::alloc(thread.mutator(), size, alignment, 0, semantics);
+
+                object_start.store(HeapObjectHeader::<VM> {
+                    metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
+                    marker: PhantomData,
+                });
+                let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
+                Self::set_vo_bit(object);
+                Self::refill_tlab(thread);
+                object
+            },
+        }
+    }
+
     /// Allocate object with `size`, `alignment`, and `metadata` with specified `semantics`.
     ///
     /// This function is a fast-path for allocation. If you allocate with `Default` semantics,
@@ -78,41 +121,46 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         metadata: VM::Metadata,
         mut semantics: AllocationSemantics,
     ) -> VMKitObject {
+        let orig_size = size;
+        let orig_semantics = semantics;
         size += size_of::<HeapObjectHeader<VM>>();
         if semantics == AllocationSemantics::Default
             && size >= thread.max_non_los_default_alloc_bytes()
         {
             semantics = AllocationSemantics::Los;
         }
+
         // all allocator functions other than this actually invoke `flush_tlab` due to the fact
         // that GC can happen inside them.
         match semantics {
-            AllocationSemantics::Los => Self::allocate_los(thread, size, alignment, metadata),
-            AllocationSemantics::NonMoving => {
-                Self::allocate_nonmoving(thread, size, alignment, metadata)
-            }
-            AllocationSemantics::Immortal => {
-                Self::allocate_immortal(thread, size, alignment, metadata)
-            }
-            _ => unsafe {
-                let tlab = thread.tlab.get().as_mut().unwrap();
-                let object_start = tlab.allocate(size, alignment);
-                if !object_start.is_zero() {
-                    object_start.store(HeapObjectHeader::<VM> {
-                        metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
-                        marker: PhantomData,
-                    });
-                    let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
-                    Self::set_vo_bit(object);
-                    return object;
-                }
+            AllocationSemantics::Default => match thread.alloc_fastpath() {
+                AllocFastPath::TLAB => unsafe {
+                    let tlab = thread.tlab.get().as_mut().unwrap();
+                    let object_start = tlab.allocate(size, alignment);
+                    if !object_start.is_zero() {
+                        object_start.store(HeapObjectHeader::<VM> {
+                            metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
+                            marker: PhantomData,
+                        });
+                        let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
+                        Self::set_vo_bit(object);
+                        return object;
+                    }
 
-                Self::allocate_slow(thread, size, alignment, metadata, semantics)
+                    return Self::allocate_slow(thread, size, alignment, metadata, semantics)
+                },
+
+                _ => ()
             },
+
+            _ => ()
         }
+
+        Self::allocate_out_of_line(thread, orig_size, alignment, metadata, orig_semantics)
     }
 
-    pub extern "C-unwind" fn allocate_los(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_los(
         thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -142,7 +190,8 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    pub extern "C-unwind" fn allocate_nonmoving(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_nonmoving(
         thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -171,7 +220,8 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    pub extern "C-unwind" fn allocate_immortal(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_immortal(
        thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -227,7 +277,7 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    #[inline(never)]
+    #[inline(always)]
     pub extern "C-unwind" fn set_vo_bit(object: VMKitObject) {
         #[cfg(feature = "cooperative")]
         unsafe {
@@ -400,3 +450,10 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         Self::object_reference_write_post(thread, src, slot, target);
     }
 }
+
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum AllocFastPath {
+    TLAB,
+    FreeList,
+    None,
+}
diff --git a/vmkit/src/mm/active_plan.rs b/vmkit/src/mm/active_plan.rs
index 1f0c2db..1f3a100 100644
--- a/vmkit/src/mm/active_plan.rs
+++ b/vmkit/src/mm/active_plan.rs
@@ -10,7 +10,7 @@ pub struct VMKitActivePlan<VM: VirtualMachine>(PhantomData<VM>);
 
 impl<VM: VirtualMachine> ActivePlan<MemoryManager<VM>> for VMKitActivePlan<VM> {
     fn is_mutator(tls: mmtk::util::VMThread) -> bool {
-        let x = Thread::<VM>::from_vm_thread(tls);
+        Thread::<VM>::from_vm_thread(tls).active_mutator_context()
     }
 
diff --git a/vmkit/src/mm/aslr.rs b/vmkit/src/mm/aslr.rs
index 2697e75..a8d9ed8 100644
--- a/vmkit/src/mm/aslr.rs
+++ b/vmkit/src/mm/aslr.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code, unused_imports, unused_variables)]
 //! Address space layout randomization for MMTk.
 
 use mmtk::util::{
diff --git a/vmkit/src/mm/scanning.rs b/vmkit/src/mm/scanning.rs
index 24bb22c..9853395 100644
--- a/vmkit/src/mm/scanning.rs
+++ b/vmkit/src/mm/scanning.rs
@@ -117,6 +117,7 @@ impl<VM: VirtualMachine> Scanning<MemoryManager<VM>> for VMKitScanning<VM> {
         VM::scan_vm_specific_roots(tls, factory);
     }
 
+    #[inline(always)]
     fn support_slot_enqueuing(
         _tls: mmtk::util::VMWorkerThread,
         object: mmtk::util::ObjectReference,
diff --git a/vmkit/src/object_model/object.rs b/vmkit/src/object_model/object.rs
index bb8cedd..1eca9ec 100644
--- a/vmkit/src/object_model/object.rs
+++ b/vmkit/src/object_model/object.rs
@@ -137,15 +137,21 @@ impl VMKitObject {
     pub fn bytes_used<VM: VirtualMachine>(self) -> usize {
         let metadata = self.header::<VM>().metadata().gc_metadata();
         let overhead = self.hashcode_overhead::<VM>();
-        
+
         if metadata.instance_size != 0 {
-            raw_align_up(metadata.instance_size + size_of::<HeapObjectHeader<VM>>(), align_of::<usize>()) + overhead
+            raw_align_up(
+                metadata.instance_size + size_of::<HeapObjectHeader<VM>>(),
+                align_of::<usize>(),
+            ) + overhead
         } else {
             let Some(compute_size) = metadata.compute_size else {
                 panic!("compute_size is not set for object at {}", self.0);
             };
 
-            raw_align_up(compute_size(self) + size_of::<HeapObjectHeader<VM>>(), align_of::<usize>()) + overhead
+            raw_align_up(
+                compute_size(self) + size_of::<HeapObjectHeader<VM>>(),
+                align_of::<usize>(),
+            ) + overhead
         }
     }
 
@@ -198,7 +204,7 @@ impl VMKitObject {
         let res = self
             .0
             .offset(-(OBJECT_REF_OFFSET as isize + self.hashcode_overhead::<VM>() as isize));
-        
+
         res
     }
 
@@ -258,7 +264,7 @@ impl VMKitObject {
             "attempt to access field out of bounds"
         );
         let ordering = if !VOLATILE {
-            atomic::Ordering::Relaxed
+            return self.as_address().add(offset).load::<T>();
         } else {
             atomic::Ordering::SeqCst
         };
@@ -282,7 +288,8 @@ impl VMKitObject {
         );
         unsafe {
             let ordering = if !VOLATILE {
-                atomic::Ordering::Relaxed
+                self.as_address().add(offset).store(value);
+                return;
             } else {
                 atomic::Ordering::SeqCst
             };
diff --git a/vmkit/src/threading.rs b/vmkit/src/threading.rs
index 197b58a..4f54779 100644
--- a/vmkit/src/threading.rs
+++ b/vmkit/src/threading.rs
@@ -11,14 +11,15 @@ use std::{
 use atomic::Atomic;
 use mmtk::{
-    util::{Address, VMMutatorThread, VMThread},
+    util::{alloc::AllocatorSelector, Address, VMMutatorThread, VMThread},
     vm::RootsWorkFactory,
-    BarrierSelector, Mutator,
+    AllocationSemantics, BarrierSelector, Mutator,
 };
 
 use crate::{
     mm::{
-        conservative_roots::ConservativeRoots, stack_bounds::StackBounds, tlab::TLAB, MemoryManager,
+        conservative_roots::ConservativeRoots, stack_bounds::StackBounds, tlab::TLAB,
+        AllocFastPath, MemoryManager,
     },
     object_model::compression::CompressedOps,
     sync::{Monitor, MonitorGuard},
@@ -156,6 +157,7 @@ pub struct Thread<VM: VirtualMachine> {
     pub tlab: UnsafeCell<TLAB>,
     max_non_los_default_alloc_bytes: Cell<usize>,
     barrier: Cell<BarrierSelector>,
+    alloc_fastpath: Cell<AllocFastPath>,
     mmtk_mutator: UnsafeCell<MaybeUninit<Mutator<MemoryManager<VM>>>>,
     has_collector_context: AtomicBool,
     exec_status: Atomic<ThreadState>,
@@ -223,6 +225,7 @@ impl<VM: VirtualMachine> Thread<VM> {
             tlab: UnsafeCell::new(TLAB::new()),
             stack_bounds: OnceCell::new(),
             barrier: Cell::new(BarrierSelector::NoBarrier),
+            alloc_fastpath: Cell::new(AllocFastPath::None),
             max_non_los_default_alloc_bytes: Cell::new(0),
             take_yieldpoint: AtomicI32::new(0),
             context: ctx.unwrap_or_else(|| VM::ThreadContext::new(collector_context)),
@@ -305,6 +308,10 @@ impl<VM: VirtualMachine> Thread<VM> {
         self.barrier.get()
     }
 
+    pub fn alloc_fastpath(&self) -> AllocFastPath {
+        self.alloc_fastpath.get()
+    }
+
     pub fn max_non_los_default_alloc_bytes(&self) -> usize {
         self.max_non_los_default_alloc_bytes.get()
     }
@@ -321,6 +328,23 @@ impl<VM: VirtualMachine> Thread<VM> {
         self.max_non_los_default_alloc_bytes
             .set(constraints.max_non_los_default_alloc_bytes);
         self.barrier.set(constraints.barrier);
+
+        let selector = mmtk::memory_manager::get_allocator_mapping(
+            &VM::get().vmkit().mmtk,
+            AllocationSemantics::Default,
+        );
+        match selector {
+            AllocatorSelector::BumpPointer(_) | AllocatorSelector::Immix(_) => {
+                self.alloc_fastpath.set(AllocFastPath::TLAB);
+            }
+
+            AllocatorSelector::FreeList(_) => {
+                self.alloc_fastpath.set(AllocFastPath::FreeList);
+            }
+
+            _ => self.alloc_fastpath.set(AllocFastPath::None),
+        }
+
         self.stack_bounds
             .set(StackBounds::current_thread_stack_bounds())
             .unwrap();
@@ -542,6 +566,10 @@ impl<VM: VirtualMachine> Thread<VM> {
         let mut lock = self.monitor().lock_no_handshake();
         self.is_blocking.store(true, Ordering::Relaxed);
 
+        log::trace!("Thread #{} in check_block_no_save_context", self.thread_id);
+
+        let mut had_really_blocked = false;
+
         loop {
             // deal with block requests
             self.acknowledge_block_requests();
             if !self.is_blocked() {
                 break;
             }
+
+            had_really_blocked = true;
+            log::trace!(
+                "Thread #{} is really blocked with status {:?}",
+                self.thread_id,
+                self.get_exec_status()
+            );
             // what if a GC request comes while we're here for a suspend()
             // request?
             // answer: we get awoken, reloop, and acknowledge the GC block
             // request.
             lock.wait_no_handshake();
+            log::trace!(
+                "Thread #{} has awoken; checking if we're still blocked",
+                self.thread_id
+            );
         }
+
+        if had_really_blocked {
+            log::trace!("Thread #{} is unblocking", self.thread_id);
+        }
 
         // SAFETY: We are holding the monitor lock.
@@ -689,22 +732,43 @@ impl<VM: VirtualMachine> Thread<VM> {
         let mut lock = self.monitor.lock_no_handshake();
         let token = A::request_block(self);
-
+        log::trace!(
+            "Thread #{} is requesting that thread #{} blocks",
+            current_thread::<VM>().thread_id,
+            self.thread_id
+        );
         if current_thread::<VM>().thread_id == self.thread_id {
+            log::trace!("Thread #{} is blocking itself", self.thread_id);
             self.check_block();
             result = self.get_exec_status();
         } else {
             if self.is_about_to_terminate() {
+                log::trace!(
+                    "Thread #{} is about to terminate, returning as if blocked in TERMINATED state",
+                    self.thread_id
+                );
                 result = ThreadState::Terminated;
             } else {
                 self.take_yieldpoint.store(1, Ordering::Relaxed);
                 let new_state = self.set_blocked_exec_status();
                 result = new_state;
+                log::trace!(
+                    "Thread #{} is blocking thread #{} which is in state {:?}",
+                    current_thread::<VM>().thread_id,
+                    self.thread_id,
+                    new_state
+                );
                 self.monitor.notify_all();
 
                 if new_state == ThreadState::InManagedToBlock {
                     if !asynchronous {
+                        log::trace!(
+                            "Thread #{} is waiting for thread #{} to block",
+                            current_thread::<VM>().thread_id,
+                            self.thread_id
+                        );
+
                         while A::has_block_request_with_token(self, token)
                             && !A::is_blocked(self)
                             && !self.is_about_to_terminate()
                         {
@@ -874,7 +938,7 @@ impl<VM: VirtualMachine> Thread<VM> {
 
     pub fn mutator(&self) -> &'static mut Mutator<MemoryManager<VM>> {
         unsafe {
-            assert!(Thread::<VM>::current().thread_id == self.thread_id);
+            debug_assert!(Thread::<VM>::current().thread_id == self.thread_id);
             self.mutator_unchecked()
         }
     }
@@ -1042,6 +1106,8 @@ impl<VM: VirtualMachine> Thread<VM> {
     /// - `fp`: The frame pointer of the service method that called this method
     ///
     /// Exposed as `extern "C-unwind"` to allow directly invoking it from JIT/AOT code.
+    #[inline(never)]
+    #[cold]
     pub extern "C-unwind" fn yieldpoint(where_from: i32, fp: Address) {
         let thread = Thread::<VM>::current();
         let _was_at_yieldpoint = thread.at_yieldpoint.load(atomic::Ordering::Relaxed);
@@ -1094,7 +1160,7 @@ thread_local! {
 pub fn current_thread<VM: VirtualMachine>() -> &'static Thread<VM> {
     let addr = CURRENT_THREAD.with(|t| *t.borrow());
-    assert!(!addr.is_zero());
+    debug_assert!(!addr.is_zero());
     unsafe { addr.as_ref() }
 }
 
@@ -1240,6 +1306,7 @@ impl<VM: VirtualMachine> ThreadManager<VM> {
     /// Fixpoint until there are no threads that we haven't blocked. Fixpoint is needed to
     /// catch the (unlikely) case that a thread spawns another thread while we are waiting.
    pub fn block_all_mutators_for_gc(&self) -> Vec<Arc<Thread<VM>>> {
+        let mut handshake_threads = Vec::with_capacity(4);
         loop {
             let lock = self.inner.lock_no_handshake();
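
Note on the allocation fast path introduced in mm.rs: `Thread::init` now caches, once per thread, which kind of inline fast path the plan's default allocator supports (`AllocFastPath::TLAB` for bump-pointer/Immix plans, `FreeList`, or `None`), and `MemoryManager::allocate` only attempts the inline TLAB bump when that cache says `TLAB`, otherwise deferring to `allocate_out_of_line`. The sketch below models that dispatch in isolation under stated assumptions; all names (`FastPath`, `Tlab`, `alloc_out_of_line`) are illustrative stand-ins, not vmkit's actual API.

// Minimal sketch of TLAB fast-path dispatch with an out-of-line fallback.
// Not vmkit code: the real implementation stores object headers, sets VO
// bits, and calls into MMTk on the slow path.

#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Eq)]
enum FastPath {
    Tlab,
    FreeList,
    None,
}

struct Tlab {
    cursor: usize, // next free address inside the thread-local buffer
    limit: usize,  // exclusive end of the buffer
}

impl Tlab {
    // Bump-pointer allocation: align the cursor up, then check that the
    // request still fits. `align` must be a power of two.
    fn allocate(&mut self, size: usize, align: usize) -> Option<usize> {
        debug_assert!(align.is_power_of_two());
        let start = (self.cursor + align - 1) & !(align - 1);
        let end = start.checked_add(size)?;
        if end <= self.limit {
            self.cursor = end;
            Some(start)
        } else {
            None
        }
    }
}

// Placeholder for the general-purpose path (`allocate_out_of_line` above):
// it would flush the TLAB, call the real allocator, and refill the TLAB.
fn alloc_out_of_line(_size: usize, _align: usize) -> usize {
    unimplemented!("call into the general-purpose allocator here")
}

fn allocate(tlab: &mut Tlab, fast_path: FastPath, size: usize, align: usize) -> usize {
    // Mirror of the new `MemoryManager::allocate`: only attempt the inline
    // bump when the cached fast-path kind permits it; everything else goes
    // straight out of line.
    if fast_path == FastPath::Tlab {
        if let Some(addr) = tlab.allocate(size, align) {
            return addr;
        }
    }
    alloc_out_of_line(size, align)
}

fn main() {
    let mut tlab = Tlab { cursor: 0x1000, limit: 0x2000 };
    // Two 64-byte objects fit in the buffer, so both take the inline path.
    let a = allocate(&mut tlab, FastPath::Tlab, 64, 16);
    let b = allocate(&mut tlab, FastPath::Tlab, 64, 16);
    assert!(a < b && b + 64 <= 0x2000);
}

Caching the fast-path kind in a `Cell` at thread initialization keeps the hot allocation path to a single cheap comparison, and it gives a JIT a stable value to consult when deciding whether to emit inline bump-allocation sequences at all.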