diff --git a/vmkit/src/main.rs b/vmkit/src/main.rs
index 8185fdb..8b3e3d0 100644
--- a/vmkit/src/main.rs
+++ b/vmkit/src/main.rs
@@ -1,32 +1,32 @@
-
+use mmtk::util::options::AffinityKind;
+use mmtk::util::Address;
 use mmtk::{util::options::PlanSelector, vm::slot::SimpleSlot, AllocationSemantics, MMTKBuilder};
-use std::mem::offset_of;
+use std::cell::RefCell;
 use std::sync::OnceLock;
+use std::sync::Arc;
+use vmkit::threading::parked_scope;
 use vmkit::{
     mm::{traits::Trace, MemoryManager},
     object_model::{
         metadata::{GCMetadata, TraceCallback},
         object::VMKitObject,
     },
+    sync::Monitor,
     threading::{GCBlockAdapter, Thread, ThreadContext},
     VMKit, VirtualMachine,
 };
 
-const CONSERVATIVE_TRACE_NODE: bool = false;
-
 #[repr(C)]
 struct Node {
-    left: VMKitObject,
-    right: VMKitObject,
-    i: usize,
-    j: usize,
+    left: NodeRef,
+    right: NodeRef,
 }
 
 static METADATA: GCMetadata<BenchVM> = GCMetadata {
     trace: TraceCallback::TraceObject(|object, tracer| unsafe {
         let node = object.as_address().as_mut_ref::<Node>();
-        node.left.trace_object(tracer);
-        node.right.trace_object(tracer);
+        node.left.0.trace_object(tracer);
+        node.right.0.trace_object(tracer);
     }),
     instance_size: size_of::<Node>(),
     compute_size: None,
@@ -104,136 +104,155 @@ impl VirtualMachine for BenchVM {
     }
 }
 
-fn make_node(
-    thread: &Thread<BenchVM>,
-    left: VMKitObject,
-    right: VMKitObject,
-    i: usize,
-    j: usize,
-) -> VMKitObject {
-    let node = MemoryManager::allocate(
-        thread,
-        size_of::<Node>(),
-        16,
-        &METADATA,
-        AllocationSemantics::Default,
-    );
-
-    unsafe {
-        node.set_field_object_no_write_barrier::<false>(offset_of!(Node, left), left);
-        node.set_field_object_no_write_barrier::<false>(offset_of!(Node, right), right);
-        node.set_field_usize::<false>(offset_of!(Node, i), i);
-        node.set_field_usize::<false>(offset_of!(Node, j), j);
-    }
-    node
-}
+#[repr(transparent)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+struct NodeRef(VMKitObject);
 
-fn tree_size(i: usize) -> usize {
-    (1 << (i + 1)) - 1
-}
-
-fn num_iters(stretch_tree_depth: usize, i: usize) -> usize {
-    4 + tree_size(stretch_tree_depth) / tree_size(i)
-}
-
-fn populate(thread: &Thread<BenchVM>, depth: usize, this_node: VMKitObject) {
-    let mut depth = depth;
-    if depth <= 0 {
-        return;
+impl NodeRef {
+    pub fn new(thread: &Thread<BenchVM>, left: NodeRef, right: NodeRef) -> Self {
+        let node = MemoryManager::<BenchVM>::allocate(
+            thread,
+            size_of::<Node>(),
+            16,
+            &METADATA,
+            AllocationSemantics::Default,
+        );
+        unsafe {
+            let node = node.as_address().as_mut_ref::<Node>();
+            node.left = left;
+            node.right = right;
+        }
+        Self(node)
     }
-    depth -= 1;
-    this_node.set_field_object::<false>(
-        offset_of!(Node, left),
-        make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0),
-    );
-    let left = this_node.get_field_object::<false>(offset_of!(Node, left));
-    this_node.set_field_object::<false>(
-        offset_of!(Node, right),
-        make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0),
-    );
-
-
-
-    populate(
-        thread,
-        depth,
-        this_node.get_field_object::<false>(offset_of!(Node, left)),
-    );
-    populate(
-        thread,
-        depth,
-        this_node.get_field_object::<false>(offset_of!(Node, right)),
-    );
-}
-
-fn make_tree(thread: &Thread<BenchVM>, depth: usize) -> VMKitObject {
-    if depth <= 0 {
-        return make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
+    pub fn left(&self) -> NodeRef {
+        unsafe {
+            let node = self.0.as_address().as_ref::<Node>();
+            node.left
+        }
     }
-    let left = make_tree(thread, depth - 1);
-    let right = make_tree(thread, depth - 1);
-    make_node(thread, left, right, 0, 0)
-}
-
-fn time_construction(thread: &Thread<BenchVM>, stretch_tree_depth: usize, depth: usize) {
-    let i_num_iters = num_iters(stretch_tree_depth, depth);
-    println!("creating {} trees of depth {}", i_num_iters, depth);
-    let start = std::time::Instant::now();
-
-    let mut i = 0;
-    while i < i_num_iters {
-        let temp_tree = make_node(thread, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
-        populate(thread, depth, temp_tree);
-        i += 1;
+    pub fn right(&self) -> NodeRef {
+        unsafe {
+            let node = self.0.as_address().as_ref::<Node>();
+            node.right
+        }
     }
-    let finish = std::time::Instant::now();
-    println!("\tTop down construction took: {:04}ms", finish.duration_since(start).as_micros() as f64 / 1000.0);
-
+    pub fn null() -> Self {
+        Self(VMKitObject::NULL)
+    }
 
-    let duration = start.elapsed();
-    println!("time_construction: {:?}", duration);
+    pub fn item_check(&self) -> usize {
+        if self.left() == NodeRef::null() {
+            1
+        } else {
+            1 + self.left().item_check() + self.right().item_check()
+        }
+    }
+
+    pub fn leaf(thread: &Thread<BenchVM>) -> Self {
+        Self::new(thread, NodeRef::null(), NodeRef::null())
+    }
 }
 
+fn bottom_up_tree(thread: &Thread<BenchVM>, depth: usize) -> NodeRef {
+    if thread.take_yieldpoint() != 0 {
+        Thread::<BenchVM>::yieldpoint(0, Address::ZERO);
+    }
+    if depth > 0 {
+        NodeRef::new(
+            thread,
+            bottom_up_tree(thread, depth - 1),
+            bottom_up_tree(thread, depth - 1),
+        )
+    } else {
+        NodeRef::leaf(thread)
+    }
+}
+
+const MIN_DEPTH: usize = 4;
+
 fn main() {
     env_logger::init();
-    let mut options = MMTKBuilder::new();
-    options.options.plan.set(PlanSelector::StickyImmix);
-    options.options.gc_trigger.set(mmtk::util::options::GCTriggerSelector::DynamicHeapSize(64*1024*1024, 8*1024*1024*1024));
-    let vm = BenchVM {
-        vmkit: VMKit::new(options)
-    };
-
-    VM.set(vm).unwrap_or_else(|_| panic!("Failed to set VM"));
+    let nthreads = std::env::var("THREADS")
+        .unwrap_or("4".to_string())
+        .parse::<usize>()
+        .unwrap();
+    let mut builder = MMTKBuilder::new();
+    builder.options.plan.set(PlanSelector::Immix);
+    builder.options.threads.set(nthreads);
+    builder.options.thread_affinity.set(AffinityKind::RoundRobin(vec![0, 1, 2, 3, 4, 5, 6, 7, 8]));
+    builder.options.gc_trigger.set(mmtk::util::options::GCTriggerSelector::DynamicHeapSize(1*1024*1024*1024, 3*1024*1024*1024));
+    VM.set(BenchVM {
+        vmkit: VMKit::new(builder),
+    })
+    .unwrap_or_else(|_| panic!());
 
     Thread::<BenchVM>::main(ThreadBenchContext, || {
-        let tls= Thread::<BenchVM>::current();
-
-        let depth = std::env::var("DEPTH").unwrap_or("18".to_string()).parse::<usize>().unwrap();
-        let long_lived_tree_depth = depth;
-
-        let stretch_tree_depth = depth + 1;
-
-        println!("stretching memory with tree of depth: {}", stretch_tree_depth);
+        let thread = Thread::<BenchVM>::current();
         let start = std::time::Instant::now();
-        make_tree(tls, stretch_tree_depth as _);
+        let n = std::env::var("DEPTH")
+            .unwrap_or("18".to_string())
+            .parse::<usize>()
+            .unwrap();
+        let max_depth = if n < MIN_DEPTH + 2 { MIN_DEPTH + 2 } else { n };
 
-        println!("creating long-lived tree of depth: {}", long_lived_tree_depth);
-        let long_lived_tree = make_node(tls, VMKitObject::NULL, VMKitObject::NULL, 0, 0);
-        populate(tls, long_lived_tree_depth as _, long_lived_tree);
+        let stretch_depth = max_depth + 1;
+        println!("stretch tree of depth {stretch_depth}");
+
+        let _ = bottom_up_tree(&thread, stretch_depth);
+        let duration = start.elapsed();
+        println!("time: {duration:?}");
 
-        let mut d = 4;
+        let results = Arc::new(Monitor::new(vec![
+            RefCell::new(String::new());
+            (max_depth - MIN_DEPTH) / 2 + 1
+        ]));
 
-        while d <= depth {
-            time_construction(tls, stretch_tree_depth, d);
-            d += 2;
+        let mut handles = Vec::new();
+
+        for d in (MIN_DEPTH..=max_depth).step_by(2) {
+            let depth = d;
+
+            let thread = Thread::<BenchVM>::for_mutator(ThreadBenchContext);
+            let results = results.clone();
+            let handle = thread.start(move || {
+                let thread = Thread::<BenchVM>::current();
+                let mut check = 0;
+
+                let iterations = 1 << (max_depth - depth + MIN_DEPTH);
+                for _ in 1..=iterations {
+                    if thread.take_yieldpoint() != 0 {
+                        Thread::<BenchVM>::yieldpoint(0, Address::ZERO);
+                    }
+                    let tree_node = bottom_up_tree(&thread, depth);
+                    check += tree_node.item_check();
+                }
+
+                *results.lock_with_handshake::<BenchVM>()[(depth - MIN_DEPTH) / 2].borrow_mut() =
+                    format!("{iterations}\t trees of depth {depth}\t check: {check}");
+            });
+            handles.push(handle);
         }
 
-        let finish = std::time::Instant::now();
-        println!("total execution time: {:04}ms", finish.duration_since(start).as_micros() as f64 / 1000.0);
-
+        parked_scope::<(), BenchVM>(|| {
+            while let Some(handle) = handles.pop() {
+                handle.join().unwrap();
+            }
+        });
+
+        for result in results.lock_with_handshake::<BenchVM>().iter() {
+            println!("{}", result.borrow());
+        }
+
+        println!(
+            "long lived tree of depth {max_depth}\t check: {}",
+            bottom_up_tree(&thread, max_depth).item_check()
+        );
+
+        let duration = start.elapsed();
+        println!("time: {duration:?}");
     });
-}
\ No newline at end of file
+}
diff --git a/vmkit/src/mm.rs b/vmkit/src/mm.rs
index 419ca99..1d45907 100644
--- a/vmkit/src/mm.rs
+++ b/vmkit/src/mm.rs
@@ -10,7 +10,7 @@ use crate::{
 use easy_bitfield::{AtomicBitfieldContainer, ToBitfield};
 use mmtk::{
     util::{
-        alloc::{AllocatorSelector, BumpAllocator, ImmixAllocator},
+        alloc::{AllocatorSelector, BumpAllocator, FreeListAllocator, ImmixAllocator},
         metadata::side_metadata::GLOBAL_SIDE_METADATA_BASE_ADDRESS,
         VMMutatorThread,
     },
@@ -65,6 +65,49 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         )
     }
 
+    /// General purpose allocation function. Always goes to `mmtk::memory_manager::alloc`
+    /// and does not attempt to perform fast-path allocation. This is useful for debugging
+    /// or when your JIT/AOT compiler is not yet able to produce fast-path allocation.
+    #[inline(never)]
+    pub extern "C-unwind" fn allocate_out_of_line(
+        thread: &Thread<VM>,
+        mut size: usize,
+        alignment: usize,
+        metadata: VM::Metadata,
+        mut semantics: AllocationSemantics,
+    ) -> VMKitObject {
+        size += size_of::<HeapObjectHeader<VM>>();
+        if semantics == AllocationSemantics::Default
+            && size >= thread.max_non_los_default_alloc_bytes()
+        {
+            semantics = AllocationSemantics::Los;
+        }
+
+        match semantics {
+            AllocationSemantics::Los => Self::allocate_los(thread, size, alignment, metadata),
+            AllocationSemantics::NonMoving => {
+                Self::allocate_nonmoving(thread, size, alignment, metadata)
+            }
+            AllocationSemantics::Immortal => {
+                Self::allocate_immortal(thread, size, alignment, metadata)
+            }
+            _ => unsafe {
+                Self::flush_tlab(thread);
+                let object_start =
+                    mmtk::memory_manager::alloc(thread.mutator(), size, alignment, 0, semantics);
+
+                object_start.store(HeapObjectHeader::<VM> {
+                    metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
+                    marker: PhantomData,
+                });
+                let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
+                Self::set_vo_bit(object);
+                Self::refill_tlab(thread);
+                object
+            },
+        }
+    }
+
     /// Allocate object with `size`, `alignment`, and `metadata` with specified `semantics`.
     ///
     /// This function is a fast-path for allocation. If you allocate with `Default` semantics,
@@ -78,41 +121,46 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         metadata: VM::Metadata,
         mut semantics: AllocationSemantics,
     ) -> VMKitObject {
+        let orig_size = size;
+        let orig_semantics = semantics;
         size += size_of::<HeapObjectHeader<VM>>();
         if semantics == AllocationSemantics::Default
             && size >= thread.max_non_los_default_alloc_bytes()
         {
             semantics = AllocationSemantics::Los;
         }
+
         // all allocator functions other than this actually invoke `flush_tlab` due to the fact
         // that GC can happen inside them.
         match semantics {
-            AllocationSemantics::Los => Self::allocate_los(thread, size, alignment, metadata),
-            AllocationSemantics::NonMoving => {
-                Self::allocate_nonmoving(thread, size, alignment, metadata)
-            }
-            AllocationSemantics::Immortal => {
-                Self::allocate_immortal(thread, size, alignment, metadata)
-            }
-            _ => unsafe {
-                let tlab = thread.tlab.get().as_mut().unwrap();
-                let object_start = tlab.allocate(size, alignment);
-                if !object_start.is_zero() {
-                    object_start.store(HeapObjectHeader::<VM> {
-                        metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
-                        marker: PhantomData,
-                    });
-                    let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
-                    Self::set_vo_bit(object);
-                    return object;
-                }
+            AllocationSemantics::Default => match thread.alloc_fastpath() {
+                AllocFastPath::TLAB => unsafe {
+                    let tlab = thread.tlab.get().as_mut().unwrap();
+                    let object_start = tlab.allocate(size, alignment);
+                    if !object_start.is_zero() {
+                        object_start.store(HeapObjectHeader::<VM> {
+                            metadata: AtomicBitfieldContainer::new(metadata.to_bitfield()),
+                            marker: PhantomData,
+                        });
+                        let object = VMKitObject::from_address(object_start + OBJECT_REF_OFFSET);
+                        Self::set_vo_bit(object);
+                        return object;
+                    }
 
-                Self::allocate_slow(thread, size, alignment, metadata, semantics)
+                    return Self::allocate_slow(thread, size, alignment, metadata, semantics)
+                },
+
+                _ => ()
             },
+
+            _ => ()
         }
+
+        Self::allocate_out_of_line(thread, orig_size, alignment, metadata, orig_semantics)
     }
 
-    pub extern "C-unwind" fn allocate_los(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_los(
         thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -142,7 +190,8 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    pub extern "C-unwind" fn allocate_nonmoving(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_nonmoving(
         thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -171,7 +220,8 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    pub extern "C-unwind" fn allocate_immortal(
+    #[inline(never)]
+    extern "C-unwind" fn allocate_immortal(
        thread: &Thread<VM>,
         size: usize,
         alignment: usize,
@@ -227,7 +277,7 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         }
     }
 
-    #[inline(never)]
+    #[inline(always)]
     pub extern "C-unwind" fn set_vo_bit(object: VMKitObject) {
         #[cfg(feature = "cooperative")]
         unsafe {
@@ -400,3 +450,10 @@ impl<VM: VirtualMachine> MemoryManager<VM> {
         Self::object_reference_write_post(thread, src, slot, target);
     }
 }
+
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum AllocFastPath {
+    TLAB,
+    FreeList,
+    None,
+}
diff --git a/vmkit/src/mm/active_plan.rs b/vmkit/src/mm/active_plan.rs
index 1f0c2db..1f3a100 100644
--- a/vmkit/src/mm/active_plan.rs
+++ b/vmkit/src/mm/active_plan.rs
@@ -10,7 +10,7 @@ pub struct VMKitActivePlan<VM: VirtualMachine>(PhantomData<VM>);
 
 impl<VM: VirtualMachine> ActivePlan<MemoryManager<VM>> for VMKitActivePlan<VM> {
     fn is_mutator(tls: mmtk::util::VMThread) -> bool {
-        let x = Thread::<VM>::from_vm_thread(tls);
+        Thread::<VM>::from_vm_thread(tls).active_mutator_context()
     }
 
diff --git a/vmkit/src/mm/aslr.rs b/vmkit/src/mm/aslr.rs
index 2697e75..a8d9ed8 100644
--- a/vmkit/src/mm/aslr.rs
+++ b/vmkit/src/mm/aslr.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code, unused_imports, unused_variables)]
 //! Address space layout randomization for MMTk.
 
 use mmtk::util::{
diff --git a/vmkit/src/mm/scanning.rs b/vmkit/src/mm/scanning.rs
index 24bb22c..9853395 100644
--- a/vmkit/src/mm/scanning.rs
+++ b/vmkit/src/mm/scanning.rs
@@ -117,6 +117,7 @@ impl<VM: VirtualMachine> Scanning<MemoryManager<VM>> for VMKitScanning<VM> {
         VM::scan_vm_specific_roots(tls, factory);
     }
 
+    #[inline(always)]
     fn support_slot_enqueuing(
         _tls: mmtk::util::VMWorkerThread,
         object: mmtk::util::ObjectReference,
diff --git a/vmkit/src/object_model/object.rs b/vmkit/src/object_model/object.rs
index bb8cedd..1eca9ec 100644
--- a/vmkit/src/object_model/object.rs
+++ b/vmkit/src/object_model/object.rs
@@ -137,15 +137,21 @@ impl VMKitObject {
     pub fn bytes_used<VM: VirtualMachine>(self) -> usize {
         let metadata = self.header::<VM>().metadata().gc_metadata();
         let overhead = self.hashcode_overhead::<VM>();
-        
+
         if metadata.instance_size != 0 {
-            raw_align_up(metadata.instance_size + size_of::<HeapObjectHeader<VM>>(), align_of::<usize>()) + overhead
+            raw_align_up(
+                metadata.instance_size + size_of::<HeapObjectHeader<VM>>(),
+                align_of::<usize>(),
+            ) + overhead
         } else {
             let Some(compute_size) = metadata.compute_size else {
                 panic!("compute_size is not set for object at {}", self.0);
             };
 
-            raw_align_up(compute_size(self) + size_of::<HeapObjectHeader<VM>>(), align_of::<usize>()) + overhead
+            raw_align_up(
+                compute_size(self) + size_of::<HeapObjectHeader<VM>>(),
+                align_of::<usize>(),
+            ) + overhead
         }
     }
 
@@ -198,7 +204,7 @@ impl VMKitObject {
         let res = self
             .0
             .offset(-(OBJECT_REF_OFFSET as isize + self.hashcode_overhead::<VM>() as isize));
-        
+
         res
     }
 
@@ -258,7 +264,7 @@ impl VMKitObject {
             "attempt to access field out of bounds"
         );
         let ordering = if !VOLATILE {
-            atomic::Ordering::Relaxed
+            return self.as_address().add(offset).load::<T>();
         } else {
             atomic::Ordering::SeqCst
         };
@@ -282,7 +288,8 @@ impl VMKitObject {
         );
         unsafe {
             let ordering = if !VOLATILE {
-                atomic::Ordering::Relaxed
+                self.as_address().add(offset).store(value);
+                return;
             } else {
                 atomic::Ordering::SeqCst
             };
diff --git a/vmkit/src/threading.rs b/vmkit/src/threading.rs
index 197b58a..4f54779 100644
--- a/vmkit/src/threading.rs
+++ b/vmkit/src/threading.rs
@@ -11,14 +11,15 @@ use std::{
 use atomic::Atomic;
 use mmtk::{
-    util::{Address, VMMutatorThread, VMThread},
+    util::{alloc::AllocatorSelector, Address, VMMutatorThread, VMThread},
     vm::RootsWorkFactory,
-    BarrierSelector, Mutator,
+    AllocationSemantics, BarrierSelector, Mutator,
 };
 
 use crate::{
     mm::{
-        conservative_roots::ConservativeRoots, stack_bounds::StackBounds, tlab::TLAB, MemoryManager,
+        conservative_roots::ConservativeRoots, stack_bounds::StackBounds, tlab::TLAB,
+        AllocFastPath, MemoryManager,
     },
     object_model::compression::CompressedOps,
     sync::{Monitor, MonitorGuard},
@@ -156,6 +157,7 @@ pub struct Thread<VM: VirtualMachine> {
     pub tlab: UnsafeCell<TLAB>,
     max_non_los_default_alloc_bytes: Cell<usize>,
     barrier: Cell<BarrierSelector>,
+    alloc_fastpath: Cell<AllocFastPath>,
     mmtk_mutator: UnsafeCell<MaybeUninit<Mutator<MemoryManager<VM>>>>,
     has_collector_context: AtomicBool,
     exec_status: Atomic<ThreadState>,
@@ -223,6 +225,7 @@ impl<VM: VirtualMachine> Thread<VM> {
             tlab: UnsafeCell::new(TLAB::new()),
             stack_bounds: OnceCell::new(),
             barrier: Cell::new(BarrierSelector::NoBarrier),
+            alloc_fastpath: Cell::new(AllocFastPath::None),
             max_non_los_default_alloc_bytes: Cell::new(0),
             take_yieldpoint: AtomicI32::new(0),
             context: ctx.unwrap_or_else(|| VM::ThreadContext::new(collector_context)),
@@ -305,6 +308,10 @@ impl<VM: VirtualMachine> Thread<VM> {
         self.barrier.get()
     }
 
+    pub fn alloc_fastpath(&self) -> AllocFastPath {
+        self.alloc_fastpath.get()
+    }
+
     pub fn max_non_los_default_alloc_bytes(&self) -> usize {
         self.max_non_los_default_alloc_bytes.get()
     }
@@ -321,6 +328,23 @@ impl<VM: VirtualMachine> Thread<VM> {
         self.max_non_los_default_alloc_bytes
             .set(constraints.max_non_los_default_alloc_bytes);
         self.barrier.set(constraints.barrier);
+
+        let selector = mmtk::memory_manager::get_allocator_mapping(
+            &VM::get().vmkit().mmtk,
+            AllocationSemantics::Default,
+        );
+        match selector {
+            AllocatorSelector::BumpPointer(_) | AllocatorSelector::Immix(_) => {
+                self.alloc_fastpath.set(AllocFastPath::TLAB);
+            }
+
+            AllocatorSelector::FreeList(_) => {
+                self.alloc_fastpath.set(AllocFastPath::FreeList);
+            }
+
+            _ => self.alloc_fastpath.set(AllocFastPath::None),
+        }
+
         self.stack_bounds
             .set(StackBounds::current_thread_stack_bounds())
             .unwrap();
@@ -542,6 +566,10 @@ impl<VM: VirtualMachine> Thread<VM> {
         let mut lock = self.monitor().lock_no_handshake();
         self.is_blocking.store(true, Ordering::Relaxed);
 
+        log::trace!("Thread #{} in check_block_no_save_context", self.thread_id);
+
+        let mut had_really_blocked = false;
+
         loop {
             // deal with block requests
             self.acknowledge_block_requests();
             if !self.is_blocked() {
                 break;
             }
+
+            had_really_blocked = true;
+            log::trace!(
+                "Thread #{} is really blocked with status {:?}",
+                self.thread_id,
+                self.get_exec_status()
+            );
             // what if a GC request comes while we're here for a suspend()
             // request?
             // answer: we get awoken, reloop, and acknowledge the GC block
             // request.
             lock.wait_no_handshake();
+            log::trace!(
+                "Thread #{} has awoken; checking if we're still blocked",
+                self.thread_id
+            );
         }
+
+        if had_really_blocked {
+            log::trace!("Thread #{} is unblocking", self.thread_id);
+        }
 
         // SAFETY: We are holding the monitor lock.
@@ -689,22 +732,43 @@ impl<VM: VirtualMachine> Thread<VM> {
         let mut lock = self.monitor.lock_no_handshake();
         let token = A::request_block(self);
-
+        log::trace!(
+            "Thread #{} is requesting that thread #{} blocks",
+            current_thread::<VM>().thread_id,
+            self.thread_id
+        );
         if current_thread::<VM>().thread_id == self.thread_id {
+            log::trace!("Thread #{} is blocking itself", self.thread_id);
             self.check_block();
             result = self.get_exec_status();
         } else {
             if self.is_about_to_terminate() {
+                log::trace!(
+                    "Thread #{} is about to terminate, returning as if blocked in TERMINATED state",
+                    self.thread_id
+                );
                 result = ThreadState::Terminated;
             } else {
                 self.take_yieldpoint.store(1, Ordering::Relaxed);
                 let new_state = self.set_blocked_exec_status();
                 result = new_state;
+                log::trace!(
+                    "Thread #{} is blocking thread #{} which is in state {:?}",
+                    current_thread::<VM>().thread_id,
+                    self.thread_id,
+                    new_state
+                );
                 self.monitor.notify_all();
 
                 if new_state == ThreadState::InManagedToBlock {
                     if !asynchronous {
+                        log::trace!(
+                            "Thread #{} is waiting for thread #{} to block",
+                            current_thread::<VM>().thread_id,
+                            self.thread_id
+                        );
+
                         while A::has_block_request_with_token(self, token)
                             && !A::is_blocked(self)
                             && !self.is_about_to_terminate()
                         {
@@ -874,7 +938,7 @@ impl<VM: VirtualMachine> Thread<VM> {
 
     pub fn mutator(&self) -> &'static mut Mutator<MemoryManager<VM>> {
         unsafe {
-            assert!(Thread::<VM>::current().thread_id == self.thread_id);
+            debug_assert!(Thread::<VM>::current().thread_id == self.thread_id);
             self.mutator_unchecked()
         }
     }
@@ -1042,6 +1106,8 @@ impl<VM: VirtualMachine> Thread<VM> {
     /// - `fp`: The frame pointer of the service method that called this method
     ///
     /// Exposed as `extern "C-unwind"` to allow directly invoking it from JIT/AOT code.
+    #[inline(never)]
+    #[cold]
     pub extern "C-unwind" fn yieldpoint(where_from: i32, fp: Address) {
         let thread = Thread::<VM>::current();
         let _was_at_yieldpoint = thread.at_yieldpoint.load(atomic::Ordering::Relaxed);
@@ -1094,7 +1160,7 @@ thread_local! {
 pub fn current_thread<VM: VirtualMachine>() -> &'static Thread<VM> {
     let addr = CURRENT_THREAD.with(|t| *t.borrow());
-    assert!(!addr.is_zero());
+    debug_assert!(!addr.is_zero());
     unsafe { addr.as_ref() }
 }
 
@@ -1240,6 +1306,7 @@ impl<VM: VirtualMachine> ThreadManager<VM> {
     /// Fixpoint until there are no threads that we haven't blocked. Fixpoint is needed to
     /// catch the (unlikely) case that a thread spawns another thread while we are waiting.
    pub fn block_all_mutators_for_gc(&self) -> Vec<Arc<Thread<VM>>> {
+        let mut handshake_threads = Vec::with_capacity(4);
         loop {
             let lock = self.inner.lock_no_handshake();
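
Note on the allocation fast path introduced in mm.rs: `Thread::init` now caches, once per thread, which kind of inline fast path the plan's default allocator supports (`AllocFastPath::TLAB` for bump-pointer/Immix plans, `FreeList`, or `None`), and `MemoryManager::allocate` only attempts the inline TLAB bump when that cache says `TLAB`, otherwise deferring to `allocate_out_of_line`. The sketch below models that dispatch in isolation under stated assumptions; all names (`FastPath`, `Tlab`, `alloc_out_of_line`) are illustrative stand-ins, not vmkit's actual API.

// Minimal sketch of TLAB fast-path dispatch with an out-of-line fallback.
// Not vmkit code: the real implementation stores object headers, sets VO
// bits, and calls into MMTk on the slow path.

#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Eq)]
enum FastPath {
    Tlab,
    FreeList,
    None,
}

struct Tlab {
    cursor: usize, // next free address inside the thread-local buffer
    limit: usize,  // exclusive end of the buffer
}

impl Tlab {
    // Bump-pointer allocation: align the cursor up, then check that the
    // request still fits. `align` must be a power of two.
    fn allocate(&mut self, size: usize, align: usize) -> Option<usize> {
        debug_assert!(align.is_power_of_two());
        let start = (self.cursor + align - 1) & !(align - 1);
        let end = start.checked_add(size)?;
        if end <= self.limit {
            self.cursor = end;
            Some(start)
        } else {
            None
        }
    }
}

// Placeholder for the general-purpose path (`allocate_out_of_line` above):
// it would flush the TLAB, call the real allocator, and refill the TLAB.
fn alloc_out_of_line(_size: usize, _align: usize) -> usize {
    unimplemented!("call into the general-purpose allocator here")
}

fn allocate(tlab: &mut Tlab, fast_path: FastPath, size: usize, align: usize) -> usize {
    // Mirror of the new `MemoryManager::allocate`: only attempt the inline
    // bump when the cached fast-path kind permits it; everything else goes
    // straight out of line.
    if fast_path == FastPath::Tlab {
        if let Some(addr) = tlab.allocate(size, align) {
            return addr;
        }
    }
    alloc_out_of_line(size, align)
}

fn main() {
    let mut tlab = Tlab { cursor: 0x1000, limit: 0x2000 };
    // Two 64-byte objects fit in the buffer, so both take the inline path.
    let a = allocate(&mut tlab, FastPath::Tlab, 64, 16);
    let b = allocate(&mut tlab, FastPath::Tlab, 64, 16);
    assert!(a < b && b + 64 <= 0x2000);
}

Caching the fast-path kind in a `Cell` at thread initialization keeps the hot allocation path to a single cheap comparison, and it gives a JIT a stable value to consult when deciding whether to emit inline bump-allocation sequences at all.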