diff --git a/ChangeLog.md b/ChangeLog.md index c4e6edbd910fb..9a28bc93b0ad3 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -23,6 +23,7 @@ See docs/process.md for more on how version tagging works. - The emscripten_futux_wait API is now documented to explicitly allow spurious wakeups. This was part of an internal change to improve inter-thread communication. (#26659) +- mimalloc was updated to 3.3.0. (#26696) 5.0.6 - 04/14/26 ---------------- diff --git a/system/lib/mimalloc/LICENSE b/system/lib/mimalloc/LICENSE index 670b668a0c928..53315ebee557a 100644 --- a/system/lib/mimalloc/LICENSE +++ b/system/lib/mimalloc/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen +Copyright (c) 2018-2025 Microsoft Corporation, Daan Leijen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/system/lib/mimalloc/README.emscripten b/system/lib/mimalloc/README.emscripten index 92ffdea281d27..1f5c933e65b7a 100644 --- a/system/lib/mimalloc/README.emscripten +++ b/system/lib/mimalloc/README.emscripten @@ -1,5 +1,5 @@ -This contains mimalloc 8c532c32c3c96e5ba1f2283e032f69ead8add00f (v2.1.7) with +This contains mimalloc 0ddf397796fbefa35b3278bd4431c2913a9892eb (v3.3.0) with Emscripten-specific changes. Origin: https://github.com/microsoft/mimalloc diff --git a/system/lib/mimalloc/include/mimalloc-stats.h b/system/lib/mimalloc/include/mimalloc-stats.h new file mode 100644 index 0000000000000..3d7fbee6bf677 --- /dev/null +++ b/system/lib/mimalloc/include/mimalloc-stats.h @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2024-2025, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_STATS_H +#define MIMALLOC_STATS_H + +#include +#include + +#define MI_STAT_VERSION 5 // increased on every backward incompatible change + +// alignment for atomic fields +#if defined(_MSC_VER) +#define mi_decl_align(a) __declspec(align(a)) +#elif defined(__GNUC__) +#define mi_decl_align(a) __attribute__((aligned(a))) +#elif __cplusplus >= 201103L +#define mi_decl_align(a) alignas(a) +#else +#define mi_decl_align(a) +#endif + + +// count allocation over time +typedef struct mi_stat_count_s { + int64_t total; // total allocated + int64_t peak; // peak allocation + int64_t current; // current allocation +} mi_stat_count_t; + +// counters only increase +typedef struct mi_stat_counter_s { + int64_t total; // total count +} mi_stat_counter_t; + +#define MI_STAT_FIELDS() \ + MI_STAT_COUNT(pages) /* count of mimalloc pages */ \ + MI_STAT_COUNT(reserved) /* reserved memory bytes */ \ + MI_STAT_COUNT(committed) /* committed bytes */ \ + MI_STAT_COUNTER(reset) /* reset bytes */ \ + MI_STAT_COUNTER(purged) /* purged bytes */ \ + MI_STAT_COUNT(page_committed) /* committed memory inside pages */ \ + MI_STAT_COUNT(pages_abandoned) /* abandonded pages count */ \ + MI_STAT_COUNT(threads) /* number of threads */ \ + MI_STAT_COUNT(malloc_normal) /* allocated bytes <= MI_LARGE_OBJ_SIZE_MAX */ \ + MI_STAT_COUNT(malloc_huge) /* allocated bytes in huge pages */ \ + MI_STAT_COUNT(malloc_requested) /* malloc requested bytes */ \ + \ + MI_STAT_COUNTER(mmap_calls) \ + MI_STAT_COUNTER(commit_calls) \ + MI_STAT_COUNTER(reset_calls) \ + MI_STAT_COUNTER(purge_calls) \ + MI_STAT_COUNTER(arena_count) /* number of memory arena's */ \ + MI_STAT_COUNTER(malloc_normal_count) /* number of blocks <= MI_LARGE_OBJ_SIZE_MAX */ \ + MI_STAT_COUNTER(malloc_huge_count) /* number of huge bloks */ \ + MI_STAT_COUNTER(malloc_guarded_count) /* number of allocations with guard pages */ \ + \ + /* 
internal statistics */ \ + MI_STAT_COUNTER(arena_rollback_count) \ + MI_STAT_COUNTER(arena_purges) \ + MI_STAT_COUNTER(pages_extended) /* number of page extensions */ \ + MI_STAT_COUNTER(pages_retire) /* number of pages that are retired */ \ + MI_STAT_COUNTER(page_searches) /* total pages searched for a fresh page */ \ + MI_STAT_COUNTER(page_searches_count) /* searched count for a fresh page */ \ + /* only on v1 and v2 */ \ + MI_STAT_COUNT(segments) \ + MI_STAT_COUNT(segments_abandoned) \ + MI_STAT_COUNT(segments_cache) \ + MI_STAT_COUNT(_segments_reserved) \ + /* only on v3 */ \ + MI_STAT_COUNT(heaps) \ + MI_STAT_COUNT(theaps) \ + MI_STAT_COUNTER(pages_reclaim_on_alloc) \ + MI_STAT_COUNTER(pages_reclaim_on_free) \ + MI_STAT_COUNTER(pages_reabandon_full) \ + MI_STAT_COUNTER(pages_unabandon_busy_wait) \ + MI_STAT_COUNTER(heaps_delete_wait) + +// Size bins for chunks +typedef enum mi_chunkbin_e { + MI_CBIN_SMALL, // slice_count == 1 + MI_CBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS + MI_CBIN_MEDIUM, // slice_count == 8 + MI_CBIN_LARGE, // slice_count == MI_SIZE_BITS (only used if MI_ENABLE_LARGE_PAGES is 1) + MI_CBIN_HUGE, // slice_count > MI_BCHUNK_BITS + MI_CBIN_NONE, // no bin assigned yet (the chunk is completely free) + MI_CBIN_COUNT +} mi_chunkbin_t; + + +// Define the statistics structure +#define MI_BIN_HUGE (73U) // see types.h +#define MI_STAT_COUNT(stat) mi_stat_count_t stat; +#define MI_STAT_COUNTER(stat) mi_stat_counter_t stat; + +typedef struct mi_stats_s +{ + size_t size; // size of the mi_stats_t structure + size_t version; + + mi_decl_align(8) MI_STAT_FIELDS() + + // future extension + mi_stat_count_t _stat_reserved[4]; + mi_stat_counter_t _stat_counter_reserved[4]; + + // size segregated statistics + mi_stat_count_t malloc_bins[MI_BIN_HUGE+1]; // allocation per size bin + mi_stat_count_t page_bins[MI_BIN_HUGE+1]; // pages allocated per size bin + mi_stat_count_t chunk_bins[MI_CBIN_COUNT]; // 
chunks per page sizes +} mi_stats_t; + +#undef MI_STAT_COUNT +#undef MI_STAT_COUNTER + +// helper +#if __cplusplus +#define MI_STATS_ZERO_INIT { } /* empty initializer to prevent running the constructor (with msvc) */ +#else +#define MI_STATS_ZERO_INIT { 0 } /* C zero initialize */ +#endif + +#define mi_stats_t_decl(name) mi_stats_t name = MI_STATS_ZERO_INIT; name.size = sizeof(mi_stats_t); name.version = MI_STAT_VERSION; + +// Exported definitions +#ifdef __cplusplus +extern "C" { +#endif + +// stats from a heap +mi_decl_export bool mi_heap_stats_get(mi_heap_t* heap, mi_stats_t* stats) mi_attr_noexcept; +mi_decl_export char* mi_heap_stats_get_json(mi_heap_t* heap, size_t buf_size, char* buf) mi_attr_noexcept; // use mi_free to free the result if the input buf == NULL +mi_decl_export void mi_heap_stats_print_out(mi_heap_t* heap, mi_output_fun* out, void* arg) mi_attr_noexcept; + +// stats from a subprocess and its heaps aggregated +mi_decl_export bool mi_subproc_stats_get(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept; +mi_decl_export char* mi_subproc_stats_get_json(mi_subproc_id_t subproc_id, size_t buf_size, char* buf) mi_attr_noexcept; // use mi_free to free the result if the input buf == NULL +mi_decl_export void mi_subproc_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept; +// print subprocess and all its heap stats segregated +mi_decl_export void mi_subproc_heap_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept; + +// stats aggregated for the current subprocess and all its heaps. 
+mi_decl_export bool mi_stats_get(mi_stats_t* stats) mi_attr_noexcept; +mi_decl_export char* mi_stats_get_json(size_t buf_size, char* buf) mi_attr_noexcept; // use mi_free to free the result if the input buf == NULL +mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; + +// add the stats of the heap to the subprocess and clear the heap stats +mi_decl_export void mi_heap_stats_merge_to_subproc(mi_heap_t* heap); + +// stats from the subprocess without aggregating its heaps +mi_decl_export bool mi_subproc_stats_get_exclusive(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept; + +mi_decl_export char* mi_stats_as_json(mi_stats_t* stats, size_t buf_size, char* buf) mi_attr_noexcept; // use mi_free to free the result if the input buf == NULL +mi_decl_export size_t mi_stats_get_bin_size(size_t bin) mi_attr_noexcept; + +#ifdef __cplusplus +} +#endif + +#endif // MIMALLOC_STATS_H diff --git a/system/lib/mimalloc/include/mimalloc.h b/system/lib/mimalloc/include/mimalloc.h index c41bcc8039190..b3d55760861c2 100644 --- a/system/lib/mimalloc/include/mimalloc.h +++ b/system/lib/mimalloc/include/mimalloc.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2026, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 217 // major + 2 digits minor +#define MI_MALLOC_VERSION 30300 // major + 2 digits minor + 2 digits patch // ------------------------------------------------------ // Compiler specific attributes @@ -97,7 +97,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // size_t #include // bool -#include // INTPTR_MAX #ifdef __cplusplus extern "C" { @@ -118,7 +117,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; // ------------------------------------------------------ -// Extended functionality +// Extended allocation functions // ------------------------------------------------------ #define MI_SMALL_WSIZE_MAX (128) #define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*)) @@ -134,9 +133,48 @@ mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize) mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; +// `mi_free_small` is for special applications like language runtimes. +// it should only be used to free objects from `mi_(heap_)(m|z)alloc_small` and is potentially a tiny bit faster than `mi_free` +mi_decl_export void mi_free_small(void* p) mi_attr_noexcept; + +// ------------------------------------------------------------------------------------- +// Aligned allocation +// Note that `alignment` always follows `size` for consistency with unaligned +// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. 
+// ------------------------------------------------------------------------------------- + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1, 2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1, 2); +mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); + + +// ------------------------------------------------------ +// Typed allocation, the type is always the first parameter +// ------------------------------------------------------ + +#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) +#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) +#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp))) +#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp))) +#define mi_reallocn_tp(tp,p,n) 
((tp*)mi_reallocn(p,n,sizeof(tp))) +#define mi_recalloc_tp(tp,p,n) ((tp*)mi_recalloc(p,n,sizeof(tp))) + +#define mi_heap_malloc_tp(tp,hp) ((tp*)mi_heap_malloc(hp,sizeof(tp))) +#define mi_heap_zalloc_tp(tp,hp) ((tp*)mi_heap_zalloc(hp,sizeof(tp))) +#define mi_heap_calloc_tp(tp,hp,n) ((tp*)mi_heap_calloc(hp,n,sizeof(tp))) +#define mi_heap_mallocn_tp(tp,hp,n) ((tp*)mi_heap_mallocn(hp,n,sizeof(tp))) +#define mi_heap_reallocn_tp(tp,hp,p,n) ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp))) +#define mi_heap_recalloc_tp(tp,hp,p,n) ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp))) + // ------------------------------------------------------ // Internals +// See also `mimalloc-stats.h` for statistics // ------------------------------------------------------ typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg); @@ -148,62 +186,71 @@ mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_no typedef void (mi_cdecl mi_error_fun)(int err, void* arg); mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg); -mi_decl_export void mi_collect(bool force) mi_attr_noexcept; -mi_decl_export int mi_version(void) mi_attr_noexcept; -mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; -mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; -mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL -mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; +mi_decl_export void mi_collect(bool force) mi_attr_noexcept; +mi_decl_export int mi_version(void) mi_attr_noexcept; +mi_decl_export void mi_options_print(void) mi_attr_noexcept; +mi_decl_export void mi_process_info_print(void) mi_attr_noexcept; +mi_decl_export void mi_options_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; +mi_decl_export void mi_process_info_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; +mi_decl_export void 
mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, + size_t* current_rss, size_t* peak_rss, + size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; + + +// Generally do not use the following as these are usually called automatically mi_decl_export void mi_process_init(void) mi_attr_noexcept; +mi_decl_export void mi_cdecl mi_process_done(void) mi_attr_noexcept; mi_decl_export void mi_thread_init(void) mi_attr_noexcept; mi_decl_export void mi_thread_done(void) mi_attr_noexcept; -mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; +mi_decl_export void mi_thread_set_in_threadpool(void) mi_attr_noexcept; // communicate that a thread is in a threadpool -mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, - size_t* current_rss, size_t* peak_rss, - size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; -// ------------------------------------------------------------------------------------- -// Aligned allocation -// Note that `alignment` always follows `size` for consistency with unaligned -// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. 
-// ------------------------------------------------------------------------------------- +// ----------------------------------------------------------------- +// Return allocated block size (if the return value is not NULL) +// ----------------------------------------------------------------- -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); -mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); -mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc(size_t size, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_ucalloc(size_t count, size_t size, 
size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_nodiscard mi_decl_export void* mi_urealloc(void* p, size_t newsize, size_t* block_size_pre, size_t* block_size_post) mi_attr_noexcept mi_attr_alloc_size(2); +mi_decl_export void mi_ufree(void* p, size_t* block_size) mi_attr_noexcept; + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_uzalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_umalloc_small(size_t size, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_uzalloc_small(size_t size, size_t* block_size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); // ------------------------------------------------------------------------------------- -// Heaps: first-class, but can only allocate from the same thread that created it. +// Heaps: first-class. 
Can allocate from any thread (and be free'd from any thread) +// Heaps keep allocations in separate pages from each other (but share the arena's and free'd pages) // ------------------------------------------------------------------------------------- struct mi_heap_s; typedef struct mi_heap_s mi_heap_t; mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void); -mi_decl_export void mi_heap_delete(mi_heap_t* heap); -mi_decl_export void mi_heap_destroy(mi_heap_t* heap); -mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap); -mi_decl_export mi_heap_t* mi_heap_get_default(void); -mi_decl_export mi_heap_t* mi_heap_get_backing(void); -mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept; - -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_export void mi_heap_delete(mi_heap_t* heap); // move live blocks to the main heap +mi_decl_export void mi_heap_destroy(mi_heap_t* heap); // free all live blocks +mi_decl_export void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node); +mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force); + +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_main(void); +mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_of(const void* p); +mi_decl_nodiscard mi_decl_export bool mi_heap_contains(const mi_heap_t* heap, const void* p); +mi_decl_nodiscard mi_decl_export bool mi_any_heap_contains(const void* p); + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc 
mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); -mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); -mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); +mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3, 4); +mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* theap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; @@ -235,81 +282,152 @@ mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newco mi_decl_nodiscard mi_decl_export 
void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); -mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3, 4); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); -mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5); -mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3, 4) mi_attr_alloc_align(5); +mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3, 4); + // ------------------------------------------------------ -// Analysis +// Visiting pages and individual blocks in a heap. 
// ------------------------------------------------------ -mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p); -mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p); -mi_decl_export bool mi_check_owned(const void* p); - // An area of heap space contains blocks of a single size. typedef struct mi_heap_area_s { - void* blocks; // start of the area containing heap blocks + void* blocks; // start of the area containing theap blocks size_t reserved; // bytes reserved for this area (virtual) size_t committed; // current available bytes for this area size_t used; // number of allocated blocks size_t block_size; // size in bytes of each block size_t full_block_size; // size in bytes of a full block including padding and metadata. + void* reserved1; // internal } mi_heap_area_t; typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); -mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); +mi_decl_export bool mi_heap_visit_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); +mi_decl_export bool mi_heap_visit_abandoned_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + + +// ------------------------------------------------------ +// Arena memory management +// Arena's are larger memory area's provided by the OS or user +// ------------------------------------------------------ -// Experimental -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; -mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; 
+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; -mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; +mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; +mi_decl_export void mi_arenas_print(void) mi_attr_noexcept; +mi_decl_export size_t mi_arena_min_alignment(void); +mi_decl_export size_t mi_arena_min_size(void); -// Experimental: heaps associated with specific memory arena's -typedef int mi_arena_id_t; -mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); -mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; -mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +typedef void* mi_arena_id_t; +mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); +mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool 
exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p); -#if MI_MALLOC_VERSION >= 182 // Create a heap that only allocates in the specified arena mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); -#endif -// deprecated -mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +// ------------------------------------------------------ +// Subprocesses +// Advanced: allow sub-processes whose memory arena's stay fully separated (and no reclamation between them). +// Used for example for separate interpreters in one process. +// ------------------------------------------------------ + +typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_main(void); +mi_decl_export mi_subproc_id_t mi_subproc_current(void); +mi_decl_export mi_subproc_id_t mi_subproc_new(void); +mi_decl_export void mi_subproc_destroy(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + +typedef bool (mi_cdecl mi_heap_visit_fun)(mi_heap_t* heap, void* arg); +mi_decl_export bool mi_subproc_visit_heaps(mi_subproc_id_t subproc, mi_heap_visit_fun* visitor, void* arg); + + +// ------------------------------------------------------------------------------------- +// A "theap" is a thread-local heap. 
This API is only provided for special circumstances like runtimes +// that already have a thread-local context and can store the theap there for (slightly) faster allocations. +// This also allows to set a default theap for the current thread so that `malloc` etc. allocate from +// that theap (instead of the main (t)heap). +// Theaps are first-class, but can only allocate from the same thread that created it. +// Allocation through a `theap` may be a tiny bit faster than using plain malloc +// (as we don't need to lookup the thread local variable). +// ------------------------------------------------------------------------------------- + +struct mi_theap_s; +typedef struct mi_theap_s mi_theap_t; + +mi_decl_export mi_theap_t* mi_heap_theap(mi_heap_t* heap); +mi_decl_export mi_theap_t* mi_theap_set_default(mi_theap_t* theap); +mi_decl_export mi_theap_t* mi_theap_get_default(void); +mi_decl_export void mi_theap_collect(mi_theap_t* theap, bool force) mi_attr_noexcept; + +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_zalloc(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_calloc(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_zalloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_theap_malloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) 
mi_attr_alloc_align(3); +mi_decl_nodiscard mi_decl_export void* mi_theap_realloc(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); + + +// ------------------------------------------------------ +// Experimental +// ------------------------------------------------------ + +// Experimental: objects followed by a guard page. +// Setting the sample rate on a specific theap can be used to test parts of the program more +// specifically (in combination with `mi_theap_set_default`). +// A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object. +// A seed of 0 uses a random start point. Only objects within the size bound are eligible for guard pages. +mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed); +mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max); + +// very experimental +typedef bool (mi_cdecl mi_commit_fun_t)(bool commit, void* start, size_t size, bool* is_zero, void* user_arg); +mi_decl_export bool mi_manage_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, + mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept; + +//mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); +//mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id); +//mi_decl_export bool mi_theap_reload(mi_theap_t* theap, mi_arena_id_t arena); +//mi_decl_export void mi_theap_unload(mi_theap_t* theap); + +// unsafe: assumes the page belonging to `p` is only accessed by the calling thread. 
+mi_decl_export bool mi_unsafe_heap_page_is_under_utilized(mi_heap_t* heap, void* p, size_t perc_threshold) mi_attr_noexcept; // ------------------------------------------------------ -// Convenience +// Deprecated // ------------------------------------------------------ -#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) -#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) -#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp))) -#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp))) -#define mi_reallocn_tp(p,tp,n) ((tp*)mi_reallocn(p,n,sizeof(tp))) -#define mi_recalloc_tp(p,tp,n) ((tp*)mi_recalloc(p,n,sizeof(tp))) +mi_decl_export bool mi_check_owned(const void* p); + +mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; +mi_decl_export bool mi_theap_visit_blocks(const mi_theap_t* theap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexcept; + +mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; +mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; +mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL -#define mi_heap_malloc_tp(hp,tp) ((tp*)mi_heap_malloc(hp,sizeof(tp))) -#define mi_heap_zalloc_tp(hp,tp) ((tp*)mi_heap_zalloc(hp,sizeof(tp))) -#define mi_heap_calloc_tp(hp,tp,n) ((tp*)mi_heap_calloc(hp,n,sizeof(tp))) -#define mi_heap_mallocn_tp(hp,tp,n) ((tp*)mi_heap_mallocn(hp,n,sizeof(tp))) -#define mi_heap_reallocn_tp(hp,p,tp,n) ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp))) -#define mi_heap_recalloc_tp(hp,p,tp,n) ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp))) +mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) 
mi_attr_noexcept; // not deprecated but declared in `mimalloc-stats.h` now. // ------------------------------------------------------ @@ -322,39 +440,55 @@ typedef enum mi_option_e { mi_option_show_stats, // print statistics on termination mi_option_verbose, // print verbose messages // advanced options - mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) + mi_option_deprecated_eager_commit, mi_option_arena_eager_commit, // eager commit arenas? Use 2 to enable just on overcommit systems (=2) mi_option_purge_decommits, // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit) - mi_option_allow_large_os_pages, // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process. + mi_option_allow_large_os_pages, // allow use of large (2 or 4 MiB) OS pages, implies eager commit. mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB pages) at startup mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`) mi_option_deprecated_segment_cache, mi_option_deprecated_page_reset, - mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination - mi_option_deprecated_segment_reset, - mi_option_eager_commit_delay, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + mi_option_deprecated_abandoned_page_purge, + mi_option_deprecated_segment_reset, + mi_option_deprecated_eager_commit_delay, mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. 
mi_option_disallow_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) mi_option_os_tag, // tag used for OS logging (macOS only for now) (=100) mi_option_max_errors, // issue at most N error messages mi_option_max_warnings, // issue at most N warning messages - mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_deprecated_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) - mi_option_purge_extend_delay, - mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_deprecated_purge_extend_delay, mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) + mi_option_visit_abandoned, // allow visiting theap blocks from abandoned threads (=0) + mi_option_guarded_min, // only used when building with MI_GUARDED: minimal rounded object size for guarded objects (=0) + mi_option_guarded_max, // only used when building with MI_GUARDED: maximal rounded object size for guarded objects (=0) + mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) + mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) + mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) + mi_option_generic_collect, // collect theaps every N (=10000) generic allocation calls + mi_option_page_reclaim_on_free, // reclaim abandoned pages on a free (=0). -1 disallows always, 0 allows if the page originated from the current theap, 1 allows always + mi_option_page_full_retain, // retain N full (small) pages per size class (=2) + mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) + mi_option_max_vabits, // max user space virtual address bits to consider (=48) + mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) + mi_option_page_commit_on_demand, // commit page memory on-demand + mi_option_page_max_reclaim, // don't reclaim pages of the same originating theap if we already own N pages (in that size class) (=-1 (unlimited)) + mi_option_page_cross_thread_max_reclaim, // don't reclaim pages across threads if we already own N pages (in that size class) (=16) + mi_option_allow_thp, // allow transparent huge pages? (=1) (on Android =0 by default). Set to 0 to disable THP for the process. + mi_option_minimal_purge_size, // set minimal purge size (in KiB) (=0). By default set to either 64 or 2048 if THP is enabled. 
+ mi_option_arena_max_object_size, // set maximal object size that can be allocated in an arena (in KiB) (=2GiB on 64-bit). _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, mi_option_eager_region_commit = mi_option_arena_eager_commit, mi_option_reset_decommits = mi_option_purge_decommits, mi_option_reset_delay = mi_option_purge_delay, - mi_option_abandoned_page_reset = mi_option_abandoned_page_purge, mi_option_limit_os_alloc = mi_option_disallow_os_alloc } mi_option_t; @@ -375,7 +509,7 @@ mi_decl_export void mi_option_set_default(mi_option_t option, long value); // ------------------------------------------------------------------------------------------------------- // "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions. // (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.) -// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing. +// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our theap before free-ing. 
// ------------------------------------------------------------------------------------------------------- mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; @@ -414,7 +548,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, s mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3); -mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3); #ifdef __cplusplus @@ -492,7 +626,7 @@ template bool operator!=(const mi_stl_allocator& , const #include // std::shared_ptr -// Common base class for STL allocators in a specific heap +// Common base class for STL allocators in a specific theap template struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common { using typename _mi_stl_allocator_common::size_type; using typename _mi_stl_allocator_common::value_type; diff --git a/system/lib/mimalloc/include/mimalloc/atomic.h b/system/lib/mimalloc/include/mimalloc/atomic.h index d5333dd90f7ca..e21bf901b287e 100644 --- a/system/lib/mimalloc/include/mimalloc/atomic.h +++ b/system/lib/mimalloc/include/mimalloc/atomic.h @@ -1,17 +1,28 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023 Microsoft Research, Daan Leijen +Copyright (c) 2018-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_ATOMIC_H -#define MIMALLOC_ATOMIC_H +#ifndef MI_ATOMIC_H +#define MI_ATOMIC_H + +// include windows.h or pthreads.h +#if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)) +#define MI_USE_PTHREADS +#include +#endif // -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. -// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. +// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode. // This is why we try to use only `uintptr_t` and `*` as atomic types. // To gain better insight in the range of used atomics, we use explicitly named memory order operations // instead of passing the memory order as a parameter. @@ -20,33 +31,33 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(__cplusplus) // Use C++ atomics #include -#define _Atomic(tp) std::atomic -#define mi_atomic(name) std::atomic_##name -#define mi_memory_order(name) std::memory_order_##name -#if (__cplusplus >= 202002L) // c++20, see issue #571 -#define MI_ATOMIC_VAR_INIT(x) x +#define _Atomic(tp) std::atomic +#define mi_atomic(name) std::atomic_##name +#define mi_memory_order(name) std::memory_order_##name +#if (__cplusplus >= 202002L) // c++20, see issue #571 + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else - #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif #elif defined(_MSC_VER) // Use MSVC C wrapper for C11 atomics -#define _Atomic(tp) tp -#define MI_ATOMIC_VAR_INIT(x) x -#define mi_atomic(name) mi_atomic_##name -#define mi_memory_order(name) mi_memory_order_##name +#define _Atomic(tp) tp +#define MI_ATOMIC_VAR_INIT(x) x +#define mi_atomic(name) mi_atomic_##name +#define mi_memory_order(name) mi_memory_order_##name #else // Use C11 atomics #include -#define mi_atomic(name) atomic_##name -#define mi_memory_order(name) memory_order_##name +#define mi_atomic(name) atomic_##name +#define mi_memory_order(name) memory_order_##name #if (__STDC_VERSION__ >= 201710L) // c17, see issue #735 - #define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) - #define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else - #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) + #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif #endif @@ -61,18 +72,24 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) #define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_exchange_relaxed(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) + +#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) #define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) -#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_and_acq_rel(p,x) 
mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1) @@ -99,6 +116,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des) +#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,(tp*)des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,(tp*)x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x) #else @@ -107,6 +126,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des) +#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) mi_atomic_cas_strong_acq_rel(p,exp,des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) mi_atomic_exchange_relaxed(p,x) #define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x) #endif @@ -115,6 +136,12 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { return 
mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); } +static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { + const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd); + if (add != 0) { + mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); + } +} static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p); while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, ¤t, x)) { /* nothing */ }; @@ -133,10 +160,6 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; @@ -250,6 +273,14 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int6 return current; #endif } + +static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { + const int64_t add = *padd; + if (add != 0) { + mi_atomic_addi64_relaxed((volatile _Atomic(int64_t)*)p, add); + } +} + static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { int64_t current; do { @@ -280,6 +311,8 @@ static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, #define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) #define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_strong_acq_rel(tp,p,exp,des) 
mi_atomic_cas_strong_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_exchange_ptr_relaxed(tp,p,x) (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) #define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) @@ -302,11 +335,16 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } + +// ---------------------------------------------------------------------- +// Once and Guard +// ---------------------------------------------------------------------- + typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation static inline bool mi_atomic_once( mi_atomic_once_t* once ) { - if (mi_atomic_load_relaxed(once) != 0) return false; // quick test + if (mi_atomic_load_relaxed(once) != 0) return false; // quick test uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 } @@ -322,19 +360,13 @@ typedef _Atomic(uintptr_t) mi_atomic_guard_t; +// ---------------------------------------------------------------------- // Yield -#if defined(__cplusplus) -#include -static inline void mi_atomic_yield(void) { - std::this_thread::yield(); -} -#elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include +// ---------------------------------------------------------------------- + +#if defined(_WIN32) static inline void mi_atomic_yield(void) { - YieldProcessor(); + YieldProcessor(); // see issue #1215 and #1225 why this is preferred over __yield or SwitchToThread } #elif defined(__SSE2__) #include @@ -342,20 +374,27 @@ static inline void mi_atomic_yield(void) { _mm_pause(); } #elif (defined(__GNUC__) || defined(__clang__)) && \ - (defined(__x86_64__) || defined(__i386__) || 
defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \ - defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__) + (defined(__x86_64__) || defined(__i386__) || \ + defined(__aarch64__) || defined(__arm__) || \ + defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)) #if defined(__x86_64__) || defined(__i386__) static inline void mi_atomic_yield(void) { __asm__ volatile ("pause" ::: "memory"); } #elif defined(__aarch64__) static inline void mi_atomic_yield(void) { - __asm__ volatile("wfe"); + __asm__ volatile("isb"); } -#elif (defined(__arm__) && __ARM_ARCH__ >= 7) +#elif defined(__arm__) +#if __ARM_ARCH >= 7 static inline void mi_atomic_yield(void) { __asm__ volatile("yield" ::: "memory"); } +#else +static inline void mi_atomic_yield(void) { + __asm__ volatile ("nop" ::: "memory"); +} +#endif #elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__) #ifdef __APPLE__ static inline void mi_atomic_yield(void) { @@ -366,13 +405,8 @@ static inline void mi_atomic_yield(void) { __asm__ __volatile__ ("or 27,27,27" ::: "memory"); } #endif -#elif defined(__armel__) || defined(__ARMEL__) -static inline void mi_atomic_yield(void) { - __asm__ volatile ("nop" ::: "memory"); -} #endif #elif defined(__sun) -// Fallback for other archs #include static inline void mi_atomic_yield(void) { smt_pause(); @@ -382,6 +416,12 @@ static inline void mi_atomic_yield(void) { static inline void mi_atomic_yield(void) { sched_yield(); } +// Fallback for other archs +#elif defined(__cplusplus) +#include +static inline void mi_atomic_yield(void) { + std::this_thread::yield(); +} #else #include static inline void mi_atomic_yield(void) { @@ -390,4 +430,156 @@ static inline void mi_atomic_yield(void) { #endif -#endif // __MIMALLOC_ATOMIC_H +// ---------------------------------------------------------------------- +// Locks +// These should be light-weight in-process only 
locks. +// Only used for reserving arena's and to maintain the abandoned list. +// ---------------------------------------------------------------------- +#if _MSC_VER +#pragma warning(disable:26110) // unlock with holding lock +#endif + +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) +#define mi_lock_maybe(lock,acquire) for(bool _go = (acquire ? (mi_lock_acquire(lock),true) : true); _go; _go = (acquire ? (mi_lock_release(lock),false) : false) ) + +#if defined(_WIN32) + +#if 1 + +typedef union mi_lock_u { + size_t _init; // for static initialization + SRWLOCK mutex; // slim reader-writer lock +} mi_lock_t; + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryAcquireSRWLockExclusive(&lock->mutex); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + AcquireSRWLockExclusive(&lock->mutex); +} +static inline void mi_lock_release(mi_lock_t* lock) { + ReleaseSRWLockExclusive(&lock->mutex); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeSRWLock(&lock->mutex); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#else + +typedef union mi_lock_u { + size_t _init; // for static initialization + CRITICAL_SECTION mutex; +} mi_lock_t; + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(&lock->mutex); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(&lock->mutex); +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(&lock->mutex); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(&lock->mutex); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(&lock->mutex); +} + +#endif + +#elif defined(MI_USE_PTHREADS) + +#include // memcpy +void _mi_error_message(int err, const char* fmt, ...); + +typedef union mi_lock_u { + size_t _init; // for static initialization + pthread_mutex_t mutex; +} 
mi_lock_t; + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return (pthread_mutex_trylock(&lock->mutex) == 0); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(&lock->mutex); + if (err != 0) { + _mi_error_message(err, "internal error: lock cannot be acquired (err %i)\n", err); + } +} +static inline void mi_lock_release(mi_lock_t* lock) { + pthread_mutex_unlock(&lock->mutex); +} +static inline void mi_lock_init(mi_lock_t* lock) { + if(lock==NULL) return; + // use instead of pthread_mutex_init since that can cause allocation on some platforms (and recursively initialize) + const pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + memcpy(&lock->mutex,&mutex,sizeof(mutex)); +} +static inline void mi_lock_done(mi_lock_t* lock) { + pthread_mutex_destroy(&lock->mutex); +} + +#elif defined(__cplusplus) + +#include +typedef union mi_lock_u { + size_t _init; // for static initialization + std::mutex mutex; +} mi_lock_t; + + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return lock->mutex.try_lock(); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + lock->mutex.lock(); +} +static inline void mi_lock_release(mi_lock_t* lock) { + lock->mutex.unlock(); +} +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#else + +// fall back to poor man's locks. +// this should only be the case in a single-threaded environment (like __wasi__) + +typedef union mi_lock_u { + size_t _init; // for static initialization + _Atomic(uintptr_t) mutex; +} mi_lock_t; + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(&lock->mutex, &expected, (uintptr_t)1); +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + for (int i = 0; i < 1000; i++) { // for at most 1000 tries? 
+ if (mi_lock_try_acquire(lock)) return; + mi_atomic_yield(); + } +} +static inline void mi_lock_release(mi_lock_t* lock) { + mi_atomic_store_release(&lock->mutex, (uintptr_t)0); +} +static inline void mi_lock_init(mi_lock_t* lock) { + mi_lock_release(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + +#endif + + +#endif // MI_ATOMIC_H diff --git a/system/lib/mimalloc/include/mimalloc/bits.h b/system/lib/mimalloc/include/mimalloc/bits.h new file mode 100644 index 0000000000000..9d776514c10ab --- /dev/null +++ b/system/lib/mimalloc/include/mimalloc/bits.h @@ -0,0 +1,342 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc) +---------------------------------------------------------------------------- */ + +#pragma once +#ifndef MI_BITS_H +#define MI_BITS_H + +#include // size_t +#include // int64_t etc +#include // bool + +// ------------------------------------------------------ +// Size of a pointer. +// We assume that `sizeof(void*)==sizeof(intptr_t)` +// and it holds for all platforms we know of. +// +// However, the C standard only requires that: +// p == (void*)((intptr_t)p)) +// but we also need: +// i == (intptr_t)((void*)i) +// or otherwise one might define an intptr_t type that is larger than a pointer... 
+// ------------------------------------------------------ + +#if INTPTR_MAX > INT64_MAX +# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) +#elif INTPTR_MAX == INT64_MAX +# define MI_INTPTR_SHIFT (3) +#elif INTPTR_MAX == INT32_MAX +# define MI_INTPTR_SHIFT (2) +#else +#error platform pointers must be 32, 64, or 128 bits +#endif + +#if (INTPTR_MAX) > LONG_MAX +# define MI_PU(x) x##ULL +#else +# define MI_PU(x) x##UL +#endif + +#if SIZE_MAX == UINT64_MAX +# define MI_SIZE_SHIFT (3) +typedef int64_t mi_ssize_t; +#elif SIZE_MAX == UINT32_MAX +# define MI_SIZE_SHIFT (2) +typedef int32_t mi_ssize_t; +#else +#error platform objects must be 32 or 64 bits in size +#endif + +#if (SIZE_MAX/2) > LONG_MAX +# define MI_ZU(x) x##ULL +#else +# define MI_ZU(x) x##UL +#endif + +#define MI_INTPTR_SIZE (1< +#elif MI_ARCH_ARM64 && MI_OPT_SIMD +#include +#endif +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) +#include +#endif + +#if MI_ARCH_X64 && defined(__AVX2__) && !defined(__BMI2__) // msvc +#define __BMI2__ 1 +#endif +#if MI_ARCH_X64 && (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc +#define __BMI1__ 1 +#endif + +// Define big endian if needed +// #define MI_BIG_ENDIAN 1 + +// maximum virtual address bits in a user-space pointer +#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 +#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS +#elif MI_ARCH_X64 +#define MI_MAX_VABITS (47) +#elif MI_INTPTR_SIZE > 4 +#define MI_MAX_VABITS (48) +#else +#define MI_MAX_VABITS (32) +#endif + +// use a flat page-map or a 2-level one +#ifndef MI_PAGE_MAP_FLAT +#if MI_MAX_VABITS <= 40 && !defined(__APPLE__) && MI_SECURE==0 && !MI_PAGE_META_IS_SEPARATED +#define MI_PAGE_MAP_FLAT 1 +#else +#define MI_PAGE_MAP_FLAT 0 +#endif +#endif + + +/* -------------------------------------------------------------------------------- + Builtin's +-------------------------------------------------------------------------------- */ + 
+#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#define mi_builtin(name) __builtin_##name +#define mi_has_builtin(name) __has_builtin(__builtin_##name) + +#if (LONG_MAX == INT32_MAX) +#define mi_builtin32(name) mi_builtin(name##l) +#define mi_has_builtin32(name) mi_has_builtin(name##l) +#else +#define mi_builtin32(name) mi_builtin(name) +#define mi_has_builtin32(name) mi_has_builtin(name) +#endif +#if (LONG_MAX == INT64_MAX) +#define mi_builtin64(name) mi_builtin(name##l) +#define mi_has_builtin64(name) mi_has_builtin(name##l) +#else +#define mi_builtin64(name) mi_builtin(name##ll) +#define mi_has_builtin64(name) mi_has_builtin(name##ll) +#endif + +#if (MI_SIZE_BITS == 32) +#define mi_builtinz(name) mi_builtin32(name) +#define mi_has_builtinz(name) mi_has_builtin32(name) +#define mi_msc_builtinz(name) name +#elif (MI_SIZE_BITS == 64) +#define mi_builtinz(name) mi_builtin64(name) +#define mi_has_builtinz(name) mi_has_builtin64(name) +#define mi_msc_builtinz(name) name##64 +#endif + +/* -------------------------------------------------------------------------------- + Popcount and count trailing/leading zero's +-------------------------------------------------------------------------------- */ + +size_t _mi_popcount_generic(size_t x); + +static inline size_t mi_popcount(size_t x) { + #if mi_has_builtinz(popcount) + return mi_builtinz(popcount)(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return mi_msc_builtinz(__popcnt)(x); + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_mm_popcnt_u64(x); + #else + #define MI_HAS_FAST_POPCOUNT 0 + return (x<=1 ? 
x : _mi_popcount_generic(x)); + #endif +} + +#ifndef MI_HAS_FAST_POPCOUNT +#define MI_HAS_FAST_POPCOUNT 1 +#endif + + + +size_t _mi_clz_generic(size_t x); +size_t _mi_ctz_generic(size_t x); + +static inline size_t mi_ctz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + size_t r; + __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _tzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); + #elif mi_has_builtinz(ctz) + return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + size_t r = MI_SIZE_BITS; // bsf leaves destination unmodified if the argument is 0 (see ) + __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc"); + return r; + #elif MI_HAS_FAST_POPCOUNT + return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS); + #endif +} + +static inline size_t mi_clz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 + size_t r; + __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _lzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); + #elif mi_has_builtinz(clz) + return (x!=0 ? 
(size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return (MI_SIZE_BITS - 1 - r); + #else + #define MI_HAS_FAST_BITSCAN 0 + return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS); + #endif +} + +#ifndef MI_HAS_FAST_BITSCAN +#define MI_HAS_FAST_BITSCAN 1 +#endif + +/* -------------------------------------------------------------------------------- + find trailing/leading zero (bit scan forward/reverse) +-------------------------------------------------------------------------------- */ + +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with `idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsf(size_t x, size_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); + return !is_zero; + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); + #else + return (x!=0 ? (*idx = mi_ctz(x), true) : false); + #endif +} + +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with `idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
+static inline bool mi_bsr(size_t x, size_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); + return !is_zero; + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false); + #else + return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false); + #endif +} + + +/* -------------------------------------------------------------------------------- + rotate +-------------------------------------------------------------------------------- */ + +static inline size_t mi_rotr(size_t x, size_t r) { + #if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64) + return mi_builtin(rotateright64)(x,r); + #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) + return mi_builtin(rotateright32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64) + return _rotr64(x, (int)r); + #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32) + return _lrotr(x,(int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. 
See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); + #endif +} + +static inline size_t mi_rotl(size_t x, size_t r) { + #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) + return mi_builtin(rotateleft64)(x,r); + #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64) + return _rotl64(x, (int)r); + #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); + #endif +} + +static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateleft32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x << rshift) | (x >> ((-rshift) & 31))); + #endif +} + + +#endif // MI_BITS_H diff --git a/system/lib/mimalloc/include/mimalloc/internal.h b/system/lib/mimalloc/include/mimalloc/internal.h index 6c6e5ed04f1db..d9fb1335e4aed 100644 --- a/system/lib/mimalloc/include/mimalloc/internal.h +++ b/system/lib/mimalloc/include/mimalloc/internal.h @@ -1,21 +1,26 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_INTERNAL_H -#define MIMALLOC_INTERNAL_H - +#ifndef MI_INTERNAL_H +#define MI_INTERNAL_H // -------------------------------------------------------------------------- -// This file contains the interal API's of mimalloc and various utility +// This file contains the internal API's of mimalloc and various utility // functions and macros. // -------------------------------------------------------------------------- #include "types.h" #include "track.h" +#include "bits.h" + + +// -------------------------------------------------------------------------- +// Compiler defines +// -------------------------------------------------------------------------- #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -23,262 +28,396 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_trace_message(...) 
#endif -#define MI_CACHE_LINE 64 +#define mi_decl_cache_align mi_decl_align(64) + #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning +#define mi_decl_forceinline __forceinline #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) -#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_noreturn __declspec(noreturn) #define mi_decl_weak +#define mi_decl_hidden +#define mi_decl_cold #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc +#if !MI_TRACK_ASAN +#define mi_decl_forceinline __attribute__((always_inline)) inline +#else +#define mi_decl_forceinline inline +#endif #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread -#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_noreturn __attribute__((noreturn)) #define mi_decl_weak __attribute__((weak)) +#define mi_decl_hidden __attribute__((visibility("hidden"))) +#if (__GNUC__ >= 4) || defined(__clang__) +#define mi_decl_cold __attribute__((cold)) +#else +#define mi_decl_cold +#endif +#elif __cplusplus >= 201103L // c++11 +#define mi_decl_forceinline inline +#define mi_decl_noinline +#define mi_decl_thread thread_local +#define mi_decl_noreturn [[noreturn]] +#define mi_decl_weak +#define mi_decl_hidden +#define mi_decl_cold #else +#define mi_decl_forceinline inline #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_noreturn #define mi_decl_weak +#define mi_decl_hidden +#define mi_decl_cold #endif -#if defined(__EMSCRIPTEN__) && !defined(__wasi__) -#define __wasi__ +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 
202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if defined(__cplusplus) +#define mi_decl_externc extern "C" +#else +#define mi_decl_externc +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 7)) || defined(__clang__) // includes clang and icc +#define mi_decl_maybe_unused __attribute__((unused)) +#elif __cplusplus >= 201703L // c++17 +#define mi_decl_maybe_unused [[maybe_unused]] +#else +#define mi_decl_maybe_unused #endif #if defined(__cplusplus) -#define mi_decl_externc extern "C" +#define mi_decl_externc extern "C" #else #define mi_decl_externc #endif -// pthreads -#if !defined(_WIN32) && !defined(__wasi__) -#define MI_USE_PTHREADS -#include + +#if defined(__EMSCRIPTEN__) && !defined(__wasi__) +#define __wasi__ #endif + +// -------------------------------------------------------------------------- +// Internal functions +// -------------------------------------------------------------------------- + + +// "libc.c" +#include +int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +char* _mi_strnstr(char* s, size_t max_len, const char* pat); +bool _mi_getenv(const char* name, char* result, size_t result_size); + // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void 
_mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_options_init(void); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_raw_message(const char* fmt, ...); +void _mi_message(const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_options_init(void); +void _mi_options_post_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_theap_random_next(mi_theap_t* theap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; -extern mi_decl_cache_align const mi_page_t _mi_page_empty; -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet -mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -void 
_mi_thread_done(mi_heap_t* heap); -void _mi_thread_data_collect(void); -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); +extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; +void _mi_auto_process_init(void); +void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept; +bool _mi_is_redirected(void); +bool _mi_allocator_init(const char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_theap_t* theap); + +mi_subproc_t* _mi_subproc(void); +mi_subproc_t* _mi_subproc_main(void); +mi_heap_t* _mi_subproc_heap_main(mi_subproc_t* subproc); +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; +bool _mi_is_heap_main(const mi_heap_t* heap); +bool _mi_is_theap_main(const mi_theap_t* theap); +void _mi_theap_guarded_init(mi_theap_t* theap); +void _mi_theap_options_init(mi_theap_t* theap); +mi_theap_t* _mi_theap_default_safe(void); // ensure the returned theap is initialized +mi_theap_t* _mi_theap_main_safe(void); + // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); - -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool _mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); - -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_protect(void* addr, size_t size); -bool 
_mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats); - -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats); - -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); - -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_subproc_t* subproc ); + +size_t _mi_os_page_size(void); +size_t _mi_os_guard_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); +size_t _mi_os_minimal_purge_size(void); + +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_decommit(void* addr, size_t size); +void _mi_os_reuse(void* p, size_t size); +mi_decl_nodiscard bool _mi_os_commit(void* p, size_t size, bool* is_zero); +mi_decl_nodiscard bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size); +mi_decl_nodiscard bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size, mi_commit_fun_t* commit_fun, void* commit_fun_arg); + 
+size_t _mi_os_secure_guard_page_size(void); +bool _mi_os_secure_guard_page_set_at(void* addr, mi_memid_t memid); +bool _mi_os_secure_guard_page_set_before(void* addr, mi_memid_t memid); +bool _mi_os_secure_guard_page_reset_at(void* addr, mi_memid_t memid); +bool _mi_os_secure_guard_page_reset_before(void* addr, mi_memid_t memid); + +int _mi_os_numa_node(void); +int _mi_os_numa_node_count(void); + +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); + +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_canuse_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); +void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); + +// threadlocal.c + +mi_thread_local_t _mi_thread_local_create(void); +void _mi_thread_local_free( mi_thread_local_t key ); +bool _mi_thread_local_set( mi_thread_local_t key, void* val ); +void* _mi_thread_local_get( mi_thread_local_t key ); +void _mi_thread_locals_init(void); +void _mi_thread_locals_done(void); +void _mi_thread_locals_thread_done(void); // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); -void 
_mi_arena_unsafe_destroy_all(mi_stats_t* stats); - -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); -size_t _mi_arena_segment_abandoned_count(void); - -typedef struct mi_arena_field_cursor_s { // abstract - mi_arena_id_t start; - int count; - size_t bitmap_idx; -} mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); - -// "segment-map.c" -void _mi_segment_map_allocated_at(const mi_segment_t* segment); -void _mi_segment_map_freed_at(const mi_segment_t* segment); - -// "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld); - -#if MI_HUGE_PAGE_ABANDON -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#else -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#endif - -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); -void _mi_abandoned_await_readers(void); -void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); + +void* _mi_arenas_alloc(mi_heap_t* heap, 
size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_heap_t* heap, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid); +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); +bool _mi_arenas_contain(const void* p); +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld); +void _mi_arenas_unsafe_destroy_all(mi_subproc_t* subproc); + +mi_page_t* _mi_arenas_page_alloc(mi_theap_t* theap, size_t block_size, size_t page_alignment); +void _mi_arenas_page_free(mi_page_t* page, mi_theap_t* current_theapx /* can be NULL */); +void _mi_arenas_page_abandon(mi_page_t* page, mi_theap_t* current_theap); +void _mi_arenas_page_unabandon(mi_page_t* page, mi_theap_t* current_theapx /* can be NULL */); +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page); + +// arena-meta.c +void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); +void _mi_meta_free(void* p, size_t size, mi_memid_t memid); +bool _mi_meta_is_meta_page(void* p); + +// "page-map.c" +bool _mi_page_map_init(void); +mi_decl_nodiscard bool _mi_page_map_register(mi_page_t* page); +void _mi_page_map_unregister(mi_page_t* page); +void _mi_page_map_unregister_range(void* start, size_t size); +mi_page_t* _mi_safe_ptr_page(const void* p); +void _mi_page_map_unsafe_destroy(mi_subproc_t* subproc); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; - -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
-void _mi_heap_delayed_free_all(mi_heap_t* heap); -bool _mi_heap_delayed_free_partial(mi_heap_t* heap); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); - -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); - -void _mi_page_free_collect(mi_page_t* page,bool force); -void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments - -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +void* _mi_malloc_generic(mi_theap_t* theap, size_t size, size_t zero_huge_alignment, size_t* usable) mi_attr_noexcept mi_attr_malloc; + +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
+ +size_t _mi_page_queue_append(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_theap_t* theap, bool force); + +void _mi_page_free_collect(mi_page_t* page, bool force); +void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head); +mi_decl_nodiscard bool _mi_page_init(mi_theap_t* theap, mi_page_t* page); +bool _mi_page_queue_is_valid(mi_theap_t* theap, const mi_page_queue_t* pq); + +size_t _mi_page_stats_bin(const mi_page_t* page); // for stats +size_t _mi_bin_size(size_t bin); // for stats +size_t _mi_bin(size_t size); // for stats + +// "theap.c" +mi_theap_t* _mi_theap_create(mi_heap_t* heap, mi_tld_t* tld); +void _mi_theap_delete(mi_theap_t* theap, bool acquire_tld_theaps_lock); +void _mi_theap_default_set(mi_theap_t* theap); +void _mi_theap_cached_set(mi_theap_t* theap); +void _mi_theap_collect_retired(mi_theap_t* theap, bool force); +void _mi_theap_collect_abandon(mi_theap_t* theap); +bool _mi_theap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); +void _mi_theap_page_reclaim(mi_theap_t* theap, mi_page_t* page); +bool _mi_theap_free(mi_theap_t* theap, bool acquire_heap_theaps_lock, bool acquire_tld_theaps_lock); +void _mi_theap_incref(mi_theap_t* theap); +void _mi_theap_decref(mi_theap_t* theap); // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +mi_decl_cold mi_theap_t* _mi_heap_theap_get_or_init(const mi_heap_t* heap); // get (and possible create) the theap belonging to a heap +mi_decl_cold mi_theap_t* 
_mi_heap_theap_get_peek(const mi_heap_t* heap); // get the theap for a heap without initializing (and return NULL in that case) +void _mi_heap_move_pages(mi_heap_t* heap_from, mi_heap_t* heap_to); // in "arena.c" +void _mi_heap_destroy_pages(mi_heap_t* heap_from); // in "arena.c" +void _mi_heap_force_destroy(mi_heap_t* heap); // allow destroying the main heap // "stats.c" -void _mi_stats_done(mi_stats_t* stats); -mi_msecs_t _mi_clock_now(void); -mi_msecs_t _mi_clock_end(mi_msecs_t start); -mi_msecs_t _mi_clock_start(void); +void _mi_stats_init(void); +void _mi_stats_merge_into(mi_stats_t* to, mi_stats_t* from); -// "alloc.c" -void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; -void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); -bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); -// "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const 
char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); +// "alloc.c" +void* _mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_theap_malloc_aligned` +void* _mi_theap_malloc_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept; +void* _mi_theap_malloc_zero_ex(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept; // called from `_mi_theap_malloc_aligned` +void* _mi_theap_realloc_zero(mi_theap_t* theap, void* p, size_t newsize, bool zero, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept; +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); #if MI_DEBUG>1 -bool _mi_page_is_valid(mi_page_t* page); +bool _mi_page_is_valid(mi_page_t* page); #endif // ------------------------------------------------------ -// Branches +// Assertions // ------------------------------------------------------ -#if defined(__GNUC__) || defined(__clang__) -#define mi_unlikely(x) (__builtin_expect(!!(x),false)) -#define mi_likely(x) (__builtin_expect(!!(x),true)) -#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) -#define mi_unlikely(x) (x) [[unlikely]] -#define mi_likely(x) (x) [[likely]] +#if (MI_DEBUG) +// use our own assertion to print without memory allocation +mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func) mi_attr_noexcept; +#define mi_assert(expr) ((expr) ? 
(void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) #else -#define mi_unlikely(x) (x) -#define mi_likely(x) (x) +#define mi_assert(x) #endif -#ifndef __has_builtin -#define __has_builtin(x) 0 +#if (MI_DEBUG>1) +#define mi_assert_internal mi_assert +#else +#define mi_assert_internal(x) +#endif + +#if (MI_DEBUG>2) +#define mi_assert_expensive mi_assert +#else +#define mi_assert_expensive(x) #endif /* ----------------------------------------------------------- - Error codes passed to `_mi_fatal_error` - All are recoverable but EFAULT is a serious error and aborts by default in secure mode. - For portability define undefined error codes using common Unix codes: - + Statistics (in `stats.c`) ----------------------------------------------------------- */ -#include -#ifndef EAGAIN // double free -#define EAGAIN (11) -#endif -#ifndef ENOMEM // out of memory -#define ENOMEM (12) -#endif -#ifndef EFAULT // corrupted free-list or meta-data -#define EFAULT (14) -#endif -#ifndef EINVAL // trying to free an invalid pointer -#define EINVAL (22) -#endif -#ifndef EOVERFLOW // count*size overflow -#define EOVERFLOW (75) -#endif + +// add to stat keeping track of the peak +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount); + +// adjust stat in special cases to compensate for double counting (and does not adjust peak values and can decrease the total) +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount); +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount); +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount); + +// counters can just be increased +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void 
__mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); + +#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase_mt( &(heap)->stats.stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase_mt( &(heap)->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease_mt( &(heap)->stats.stat, amount) +#define mi_heap_stat_adjust_increase(heap,stat,amnt) __mi_stat_adjust_increase_mt( &(heap)->stats.stat, amnt) +#define mi_heap_stat_adjust_decrease(heap,stat,amnt) __mi_stat_adjust_decrease_mt( &(heap)->stats.stat, amnt) + +#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_adjust_increase(subproc,stat,amount) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_adjust_decrease(subproc,stat,amount) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amount) + +#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) + +#define mi_theap_stat_counter_increase(theap,stat,amount) __mi_stat_counter_increase( &(theap)->stats.stat, amount) +#define mi_theap_stat_increase(theap,stat,amount) __mi_stat_increase( &(theap)->stats.stat, amount) +#define mi_theap_stat_decrease(theap,stat,amount) __mi_stat_decrease( &(theap)->stats.stat, amount) +#define mi_theap_stat_adjust_increase(theap,stat,amnt) __mi_stat_adjust_increase( &(theap)->stats.stat, amnt) +#define mi_theap_stat_adjust_decrease(theap,stat,amnt) 
__mi_stat_adjust_decrease( &(theap)->stats.stat, amnt) + + +/* ----------------------------------------------------------- + Options (exposed for the debugger) +----------------------------------------------------------- */ +typedef enum mi_option_init_e { + MI_OPTION_UNINIT, // not yet initialized + MI_OPTION_DEFAULTED, // not found in the environment, use default value + MI_OPTION_INITIALIZED // found in environment or set explicitly +} mi_option_init_t; + +typedef struct mi_option_desc_s { + long value; // the value + mi_option_init_t init; // is it initialized yet? (from the environment) + mi_option_t option; // for debugging: the option index should match the option + const char* name; // option name without `mimalloc_` prefix + const char* legacy_name; // potential legacy option name +} mi_option_desc_t; + /* ----------------------------------------------------------- Inlined definitions ----------------------------------------------------------- */ #define MI_UNUSED(x) (void)(x) -#if (MI_DEBUG>0) +#ifndef NDEBUG #define MI_UNUSED_RELEASE(x) #else #define MI_UNUSED_RELEASE(x) MI_UNUSED(x) @@ -292,6 +431,9 @@ bool _mi_page_is_valid(mi_page_t* page); #define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x) #define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x) +#define MI_INIT74(x) MI_INIT64(x),MI_INIT8(x),x(),x() +#define MI_INIT5(x) MI_INIT4(x),x() +#define MI_INIT6(x) MI_INIT4(x),x(),x() #include // initialize a local variable to zero; use memset as compilers optimize constant sized memset's @@ -303,7 +445,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) { } // Is a pointer aligned? 
-static inline bool _mi_is_aligned(void* p, size_t alignment) { +static inline bool _mi_is_aligned(const void* p, size_t alignment) { mi_assert_internal(alignment != 0); return (((uintptr_t)p % alignment) == 0); } @@ -320,7 +462,11 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } -// Align downwards +// Align a pointer upwards +static inline void* _mi_align_up_ptr(const void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); +} + static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; @@ -332,23 +478,25 @@ static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { } } -// Align a pointer upwards -static inline void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); -} - -// Align a pointer downwards -static inline void* mi_align_down_ptr(void* p, size_t alignment) { +// align a pointer downwards +static inline void* _mi_align_down_ptr(const void* p, size_t alignment) { return (void*)_mi_align_down((uintptr_t)p, alignment); } - // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); return (divider == 0 ? size : ((size + divider - 1) / divider)); } + +// clamp an integer +static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) { + if (sz < min) return min; + else if (sz > max) return max; + else return sz; +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { @@ -357,7 +505,6 @@ static inline bool mi_mem_is_zero(const void* p, size_t size) { return true; } - // Align a byte size to a size in _machine words_, // i.e. byte size == `wsize*sizeof(void*)`. 
static inline size_t _mi_wsize_from_size(size_t size) { @@ -382,10 +529,13 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { } #else /* __builtin_umul_overflow is unavailable */ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { - #define MI_MUL_COULD_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) - *total = count * size; - // note: gcc/clang optimize this to directly check the overflow flag - return ((size >= MI_MUL_COULD_OVERFLOW || count >= MI_MUL_COULD_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count); + *total = count*size; + if mi_likely(((size|count)>>(4*MI_SIZE_SIZE))==0) { // did size and count fit both in the lower half bits of a size_t? + return false; + } + else { + return (size!=0 && (SIZE_MAX / size) < count); + } } #endif @@ -395,14 +545,16 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot *total = size; return false; } - else if mi_unlikely(mi_mul_overflow(count, size, total)) { + else if mi_likely(!mi_mul_overflow(count, size, total)) { + return false; + } + else { #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size); #endif *total = SIZE_MAX; return true; } - else return false; } @@ -410,163 +562,192 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot Heap functions ------------------------------------------------------------------------------------------- */ -extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap +extern mi_decl_hidden const mi_theap_t _mi_theap_empty; // read-only empty theap, initial value of the thread local default theap (in the MI_TLS_MODEL_THREAD_LOCAL) +extern mi_decl_hidden const mi_theap_t _mi_theap_empty_wrong; // read-only empty theap used to signal that a theap for a heap could not be allocated + -static inline bool mi_heap_is_backing(const mi_heap_t* heap) { - 
return (heap->tld->heap_backing == heap); +static inline mi_heap_t* _mi_theap_heap(const mi_theap_t* theap) { + return mi_atomic_load_ptr_acquire(mi_heap_t,&theap->heap); } -static inline bool mi_heap_is_initialized(mi_heap_t* heap) { - mi_assert_internal(heap != NULL); - return (heap != &_mi_heap_empty); +static inline bool mi_theap_is_initialized(const mi_theap_t* theap) { + return (theap != NULL && _mi_theap_heap(theap) != NULL); } -static inline uintptr_t _mi_ptr_cookie(const void* p) { - extern mi_heap_t _mi_heap_main; - mi_assert_internal(_mi_heap_main.cookie != 0); - return ((uintptr_t)p ^ _mi_heap_main.cookie); +static inline mi_page_t* _mi_theap_get_free_small_page(mi_theap_t* theap, size_t size) { + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); + return theap->pages_free_direct[idx]; } + +//static inline uintptr_t _mi_ptr_cookie(const void* p) { +// extern mi_theap_t _mi_theap_main; +// mi_assert_internal(_mi_theap_main.cookie != 0); +// return ((uintptr_t)p ^ _mi_theap_main.cookie); +//} + + /* ----------------------------------------------------------- - Pages + The page map maps addresses to `mi_page_t` pointers ----------------------------------------------------------- */ -static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { - mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); - const size_t idx = _mi_wsize_from_size(size); - mi_assert_internal(idx < MI_PAGES_DIRECT); - return heap->pages_free_direct[idx]; -} - -// Segment that contains the pointer -// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), -// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; -// therefore we align one byte before `p`. -// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. 
-static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); - #if MI_INTPTR_SIZE <= 4 - return (p==NULL ? NULL : segment); - #else - return ((intptr_t)segment <= 0 ? NULL : segment); - #endif +#if MI_PAGE_MAP_FLAT + +// flat page-map committed on demand, using one byte per slice (64 KiB). +// single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses) +// used by default on <= 40 bit virtual address spaces. +extern mi_decl_hidden uint8_t* _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p) { + return (size_t)((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); } -static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) { - mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0); - return (mi_page_t*)(s); +static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { + const size_t idx = _mi_page_map_index(p); + const size_t ofs = _mi_page_map[idx]; + if (valid != NULL) { *valid = (ofs != 0); } + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) + 1 - ofs) << MI_ARENA_SLICE_SHIFT); } -static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) { - mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0); - return (mi_slice_t*)(p); +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + bool valid; + mi_page_t* const page = _mi_ptr_page_ex(p, &valid); + return (valid ? 
page : NULL); } -// Segment belonging to a page -static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { - mi_assert_internal(page!=NULL); - mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries)); - return segment; +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + return _mi_ptr_page_ex(p, NULL); +} + +#else + +// 2-level page map: +// double indirection, but low commit and low virtual reserve. +// +// the page-map is usually 4 MiB (for 48 bit virtual addresses) and points to sub maps of 64 KiB. +// the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well) +// one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space +// the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 2^22 bytes = 4 MiB reserved size. +#define MI_PAGE_MAP_SUB_SHIFT (13) +#define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) +#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) +#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) + +typedef mi_page_t** mi_submap_t; +extern mi_decl_hidden _Atomic(mi_submap_t)* _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { + const size_t u = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE); + if (sub_idx != NULL) { *sub_idx = u % MI_PAGE_MAP_SUB_COUNT; } + return (u / MI_PAGE_MAP_SUB_COUNT); } -static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) { - mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset); - mi_assert_internal(start >= _mi_ptr_segment(slice)->slices); - mi_assert_internal(start->slice_offset == 0); - mi_assert_internal(start + start->slice_count > slice); - return start; +static inline mi_submap_t _mi_page_map_at(size_t idx) { + return mi_atomic_load_ptr_relaxed(mi_page_t*, 
&_mi_page_map[idx]); } -// Get the page containing the pointer (performance critical as it is called in mi_free) -static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - mi_assert_internal(p > (void*)segment); - ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE); - size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT; - mi_assert_internal(idx <= segment->slice_entries); - mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx]; - mi_slice_t* slice = mi_slice_first(slice0); // adjust to the block that holds the page data - mi_assert_internal(slice->slice_offset == 0); - mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries); - return mi_slice_to_page(slice); +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + return (_mi_page_map_at(idx))[sub_idx]; // NULL if p==NULL } -// Quick page start for initialized pages -static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert_internal(page->page_start != NULL); - mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); - return page->page_start; +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + mi_submap_t const sub = _mi_page_map_at(idx); + if mi_unlikely(sub == NULL) return NULL; + return sub[sub_idx]; } -// Get the page containing the pointer -static inline mi_page_t* _mi_ptr_page(void* p) { - mi_assert_internal(p!=NULL); - return _mi_segment_page_of(_mi_ptr_segment(p), p); +#endif + + +static inline mi_page_t* _mi_ptr_page(const void* p) { + mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); + #if MI_DEBUG || MI_SECURE || defined(__APPLE__) + return _mi_checked_ptr_page(p); + #else + return _mi_unchecked_ptr_page(p); + #endif } -// Get the 
block size of a page (special case for huge objects) + +// Get the block size of a page static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; } -static inline bool mi_page_is_huge(const mi_page_t* page) { - mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) || - (!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE)); - return page->is_huge; +// Page start +static inline uint8_t* mi_page_start(const mi_page_t* page) { + return page->page_start; } -// Get the usable block size of a page without fixed padding. -// This may still include internal padding due to alignment and rounding up size classes. -static inline size_t mi_page_usable_block_size(const mi_page_t* page) { - return mi_page_block_size(page) - MI_PADDING_SIZE; +static inline size_t mi_page_size(const mi_page_t* page) { + return mi_page_block_size(page) * page->reserved; } -// size of a segment -static inline size_t mi_segment_size(mi_segment_t* segment) { - return segment->segment_slices * MI_SEGMENT_SLICE_SIZE; +static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { + if (size) { *size = mi_page_size(page); } + return mi_page_start(page); } -static inline uint8_t* mi_segment_end(mi_segment_t* segment) { - return (uint8_t*)segment + mi_segment_size(segment); +static inline size_t mi_page_info_size(void) { + return _mi_align_up(sizeof(mi_page_t), MI_MAX_ALIGN_SIZE); } -// Thread free access -static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); +static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) { + size_t psize; + uint8_t* start = mi_page_area(page, &psize); + return (start <= (uint8_t*)p && (uint8_t*)p < start + psize); } -static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { - return 
(mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); +static inline bool mi_page_is_in_arena(const mi_page_t* page) { + return (page->memid.memkind == MI_MEM_ARENA); } -// Heap access -static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { - return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); +static inline bool mi_page_is_singleton(const mi_page_t* page) { + return (page->reserved == 1); } -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - mi_atomic_store_release(&page->xheap,(uintptr_t)heap); - if (heap != NULL) { page->heap_tag = heap->tag; } +// Get the usable block size of a page without fixed padding. +// This may still include internal padding due to alignment and rounding up size classes. +static inline size_t mi_page_usable_block_size(const mi_page_t* page) { + return mi_page_block_size(page) - MI_PADDING_SIZE; } -// Thread free flag helpers -static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { - return (mi_block_t*)(tf & ~0x03); -} -static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { - return (mi_delayed_t)(tf & 0x03); +static inline bool mi_page_meta_is_separated(const mi_page_t* page) { + #if MI_PAGE_META_IS_SEPARATED + // usually separated but can still be in front for direct OS allocations (due to size or alignment) or due to MI_PAGE_META_ALIGNED_FREE_SMALL + return (page->memid.memkind == MI_MEM_ARENA && page != _mi_align_down_ptr(page->page_start, MI_ARENA_SLICE_ALIGN)); + #else + MI_UNUSED(page); + return false; + #endif } -static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { - return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); + +static inline uint8_t* mi_page_slice_start(const mi_page_t* page) { + if (mi_page_meta_is_separated(page)) { + // page meta info is at a separate location (at `arena->pages`) + return 
(uint8_t*)_mi_align_down_ptr(page->page_start, MI_ARENA_SLICE_ALIGN); + } + else { + // page meta info is at the start of the page slices + return (uint8_t*)page; + } } -static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { - return mi_tf_make(mi_tf_block(tf),delayed); + +// This gives the offset relative to the start slice of a page. +static inline size_t mi_page_slice_offset_of(const mi_page_t* page, size_t offset_relative_to_page_start) { + return (page->page_start - mi_page_slice_start(page)) + offset_relative_to_page_start; } -static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { - return mi_tf_make(block, mi_tf_delayed(tf)); + +// Currently committed part of a page +static inline size_t mi_page_committed(const mi_page_t* page) { + return (page->slice_committed == 0 ? mi_page_size(page) : page->slice_committed - mi_page_slice_offset_of(page,0)); } // are all blocks in a page freed? @@ -576,51 +757,245 @@ static inline bool mi_page_all_free(const mi_page_t* page) { return (page->used == 0); } -// are there any available blocks? -static inline bool mi_page_has_any_available(const mi_page_t* page) { - mi_assert_internal(page != NULL && page->reserved > 0); - return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); -} - // are there immediately available blocks, i.e. blocks available on the free list. static inline bool mi_page_immediate_available(const mi_page_t* page) { mi_assert_internal(page != NULL); return (page->free != NULL); } + +// is the page not yet used up to its reserved space? 
+static inline bool mi_page_is_expandable(const mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(page->capacity <= page->reserved); + return (page->capacity < page->reserved); +} + + +static inline bool mi_page_is_full(const mi_page_t* page) { + const bool full = (page->reserved == page->used); + mi_assert_internal(!full || page->free == NULL); + return full; +} + // is more than 7/8th of a page in use? -static inline bool mi_page_mostly_used(const mi_page_t* page) { +static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); } -static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { - return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; +// is more than (n-1)/n'th of a page in use? +static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { + if (page==NULL) return true; + uint16_t frac = page->reserved / n; + return (page->reserved - page->used <= frac); +} + + +static inline bool mi_page_is_huge(const mi_page_t* page) { + return (mi_page_is_singleton(page) && + (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page))); } +static inline mi_page_queue_t* mi_page_queue(const mi_theap_t* theap, size_t size) { + mi_page_queue_t* const pq = &((mi_theap_t*)theap)->pages[_mi_bin(size)]; + if (size <= MI_LARGE_MAX_OBJ_SIZE) { mi_assert_internal(pq->block_size <= MI_LARGE_MAX_OBJ_SIZE); } + return pq; +} //----------------------------------------------------------- -// Page flags +// Page thread id and flags //----------------------------------------------------------- + +// Thread id of thread that owns this page (with flags in the bottom 2 bits) +static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); +} + +// Plain thread id of 
the thread that owns this page +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); +} + +static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { + return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); +} + +static inline bool mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { + mi_page_flags_t old; + if (set) { old = mi_atomic_or_relaxed(&page->xthread_id, newflag); } + else { old = mi_atomic_and_relaxed(&page->xthread_id, ~newflag); } + return ((old & newflag) == newflag); +} + static inline bool mi_page_is_in_full(const mi_page_t* page) { - return page->flags.x.in_full; + return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); } static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { - page->flags.x.in_full = in_full; + const bool was_in_full = mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); + if (was_in_full != in_full) { + // optimize: maintain pages_full_size to avoid visiting the full queue (issue #1220) + mi_theap_t* const theap = page->theap; + mi_assert_internal(theap!=NULL); + if (theap != NULL) { + const size_t size = page->capacity * mi_page_block_size(page); + if (in_full) { theap->pages_full_size += size; } + else { mi_assert_internal(size <= theap->pages_full_size); theap->pages_full_size -= size; } + } + } +} + +static inline bool mi_page_has_interior_pointers(const mi_page_t* page) { + return ((mi_page_flags(page) & MI_PAGE_HAS_INTERIOR_POINTERS) != 0); +} + +static inline void mi_page_set_has_interior_pointers(mi_page_t* page, bool has_aligned) { + mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_INTERIOR_POINTERS); +} + +static inline void mi_page_set_theap(mi_page_t* page, mi_theap_t* theap) { + // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on theap_destroy + page->theap = theap; + const mi_threadid_t tid = (theap == NULL ? 
MI_THREADID_ABANDONED : theap->tld->thread_id); + mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); + + // we need to use an atomic cas since a concurrent thread may still set the MI_PAGE_HAS_INTERIOR_POINTERS flag (see `alloc_aligned.c`). + mi_threadid_t xtid_old = mi_page_xthread_id(page); + mi_threadid_t xtid; + do { + xtid = tid | (xtid_old & MI_PAGE_FLAG_MASK); + } while (!mi_atomic_cas_weak_release(&page->xthread_id, &xtid_old, xtid)); } -static inline bool mi_page_has_aligned(const mi_page_t* page) { - return page->flags.x.has_aligned; +static inline bool mi_page_is_abandoned(const mi_page_t* page) { + // note: the xtheap field of an abandoned theap is set to the subproc (for fast reclaim-on-free) + return (mi_page_thread_id(page) <= MI_THREADID_ABANDONED_MAPPED); } -static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { - page->flags.x.has_aligned = has_aligned; +static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { + return (mi_page_thread_id(page) == MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_or_relaxed(&page->xthread_id, (mi_threadid_t)MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + mi_atomic_and_relaxed(&page->xthread_id, (mi_threadid_t)MI_PAGE_FLAG_MASK); +} + + +static inline mi_theap_t* mi_page_theap(const mi_page_t* page) { + mi_assert_internal(!mi_page_is_abandoned(page)); + mi_assert_internal(page->theap != NULL); + return page->theap; +} + +static inline mi_tld_t* mi_page_tld(const mi_page_t* page) { + mi_assert_internal(!mi_page_is_abandoned(page)); + mi_assert_internal(page->theap != NULL); + return page->theap->tld; +} + + +static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { + mi_heap_t* heap = page->heap; + // we use NULL for the main heap to make 
`_mi_page_get_associated_theap` fast in `free.c:mi_abandoned_page_try_reclaim`. + if mi_likely(heap==NULL) heap = mi_heap_main(); + mi_assert_internal(heap != NULL); + return heap; +} + +//----------------------------------------------------------- +// Thread free list and ownership +//----------------------------------------------------------- + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~1); +} +static inline bool mi_tf_is_owned(mi_thread_free_t tf) { + return ((tf & 1) == 1); +} +static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { + return (mi_thread_free_t)((uintptr_t)block | (owned ? 1 : 0)); +} + +// Thread free access +static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { + return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + +// are there any available blocks? +static inline bool mi_page_has_any_available(const mi_page_t* page) { + mi_assert_internal(page != NULL && page->reserved > 0); + return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); +} + +// Owned? +static inline bool mi_page_is_owned(const mi_page_t* page) { + return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + +// get ownership; returns true if the page was not owned before. 
+static inline bool mi_page_claim_ownership(mi_page_t* page) { + const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, (uintptr_t)1); + return ((old&1)==0); } +/* ------------------------------------------------------------------- + Guarded objects +------------------------------------------------------------------- */ +#if MI_GUARDED +// we always align guarded pointers in a block at an offset +// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones +#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) +#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) +#endif + +static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) { +#if MI_GUARDED + const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block; + return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED); +#else + MI_UNUSED(block); MI_UNUSED(p); + return false; +#endif +} + +#if MI_GUARDED +static inline bool mi_theap_malloc_use_guarded(mi_theap_t* theap, size_t size) { + // this code is written to result in fast assembly as it is on the hot path for allocation + const size_t count = theap->guarded_sample_count - 1; // if the rate was 0, this will underflow and count for a long time.. 
+ if mi_likely(count != 0) { + // no sample + theap->guarded_sample_count = count; + return false; + } + else if (size >= theap->guarded_size_min && size <= theap->guarded_size_max) { + // use guarded allocation + theap->guarded_sample_count = theap->guarded_sample_rate; // reset + return (theap->guarded_sample_rate != 0); + } + else { + // failed size criteria, rewind count (but don't write to an empty theap) + if (theap->guarded_sample_rate != 0) { theap->guarded_sample_count = 1; } + return false; + } +} + +mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept; + +#endif + + /* ------------------------------------------------------------------- Encoding/Decoding the free list next pointers @@ -646,27 +1021,10 @@ We also pass a separate `null` value to be used as `NULL` or otherwise `(k2<<> (MI_INTPTR_BITS - shift)))); -} -static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? 
x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift)))); + mi_page_t* page = _mi_ptr_page(p); + return mi_page_contains_address(page,q); + // return (_mi_ptr_page(p) == _mi_ptr_page(q)); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { @@ -679,10 +1037,20 @@ static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; } +static inline uint32_t mi_ptr_encode_canary(const void* null, const void* p, const uintptr_t* keys) { + const uint32_t x = (uint32_t)(mi_ptr_encode(null,p,keys)); + // make the lowest byte 0 to prevent spurious read overflows which could be a security issue (issue #951) + #if MI_BIG_ENDIAN + return (x & 0x00FFFFFF); + #else + return (x & 0xFFFFFF00); + #endif +} + static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { mi_track_mem_defined(block,sizeof(mi_block_t)); mi_block_t* next; - #ifdef MI_ENCODE_FREELIST + #if MI_ENCODE_FREELIST next = (mi_block_t*)mi_ptr_decode(null, block->next, keys); #else MI_UNUSED(keys); MI_UNUSED(null); @@ -694,7 +1062,7 @@ static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* bl static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) { mi_track_mem_undefined(block,sizeof(mi_block_t)); - #ifdef MI_ENCODE_FREELIST + #if MI_ENCODE_FREELIST block->next = mi_ptr_encode(null, next, keys); #else MI_UNUSED(keys); MI_UNUSED(null); @@ -704,7 +1072,7 @@ static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const } static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { - #ifdef MI_ENCODE_FREELIST + #if MI_ENCODE_FREELIST mi_block_t* next = mi_block_nextx(page,block,page->keys); // check for free list corruption: is `next` at least in the same page? // TODO: check if `next` is `page->block_size` aligned? 
@@ -720,7 +1088,7 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* } static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { - #ifdef MI_ENCODE_FREELIST + #if MI_ENCODE_FREELIST mi_block_set_nextx(page,block,next, page->keys); #else MI_UNUSED(page); @@ -728,50 +1096,20 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c #endif } +/* ----------------------------------------------------------- + arena blocks +----------------------------------------------------------- */ -// ------------------------------------------------------------------- -// commit mask -// ------------------------------------------------------------------- - -static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - cm->mask[i] = 0; - } -} - -static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - cm->mask[i] = ~((size_t)0); - } -} - -static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if (cm->mask[i] != 0) return false; - } - return true; +// Blocks needed for a given byte size +static inline size_t mi_slice_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); } -static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if (cm->mask[i] != ~((size_t)0)) return false; - } - return true; +// Byte size of a number of blocks +static inline size_t mi_size_of_slices(size_t bcount) { + return (bcount * MI_ARENA_SLICE_SIZE); } -// defined in `segment.c`: -size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total); -size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx); - -#define mi_commit_mask_foreach(cm,idx,count) \ - 
idx = 0; \ - while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { - -#define mi_commit_mask_foreach_end() \ - idx += count; \ - } - - /* ----------------------------------------------------------- memory id's @@ -788,14 +1126,27 @@ static inline mi_memid_t _mi_memid_none(void) { return _mi_memid_create(MI_MEM_NONE); } -static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { +static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) { mi_memid_t memid = _mi_memid_create(MI_MEM_OS); + memid.mem.os.base = base; + memid.mem.os.size = size; memid.initially_committed = committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; return memid; } +static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, size_t block_count) { + mi_memid_t memid = _mi_memid_create(MI_MEM_META); + memid.mem.meta.meta_page = mpage; + memid.mem.meta.block_index = (uint32_t)block_idx; + memid.mem.meta.block_count = (uint32_t)block_count; + memid.initially_committed = true; + memid.initially_zero = true; + memid.is_pinned = true; + return memid; +} + // ------------------------------------------------------------------- // Fast "random" shuffle @@ -803,7 +1154,7 @@ static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool static inline uintptr_t _mi_random_shuffle(uintptr_t x) { if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros -#if (MI_INTPTR_SIZE==8) +#if (MI_INTPTR_SIZE>=8) // by Sebastiano Vigna, see: x ^= x >> 30; x *= 0xbf58476d1ce4e5b9UL; @@ -821,165 +1172,65 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { return x; } -// ------------------------------------------------------------------- -// Optimize numa node access for the common case (= one node) -// ------------------------------------------------------------------- - -int _mi_os_numa_node_get(mi_os_tld_t* tld); -size_t _mi_os_numa_node_count_get(void); 
- -extern _Atomic(size_t) _mi_numa_node_count; -static inline int _mi_os_numa_node(mi_os_tld_t* tld) { - if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } - else return _mi_os_numa_node_get(tld); -} -static inline size_t _mi_os_numa_node_count(void) { - const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); - if mi_likely(count > 0) { return count; } - else return _mi_os_numa_node_count_get(); -} +// --------------------------------------------------------------------------------- +// Provide our own `_mi_memcpy/set` for potential performance optimizations. +// +// For now, only on x64/x86 we optimize to `rep movsb/stosb`. +// Generally, we check for "fast short rep movsb/stosb" (FSRM/FSRS) or "fast enhanced rep movsb" (ERMS) support +// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. +// Todo: we see improvements on win32 but less with glibc; we might want to only enable this on windows. +// --------------------------------------------------------------------------------- +#if !MI_TRACK_ENABLED && (MI_ARCH_X64 || MI_ARCH_X86) && (defined(_WIN32) || defined(__GNUC__)) -// ----------------------------------------------------------------------- -// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) -// ----------------------------------------------------------------------- - -#if defined(__GNUC__) - -#include // LONG_MAX -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_clzl(x); -#else - return __builtin_clzll(x); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_ctzl(x); -#else - return __builtin_ctzll(x); -#endif -} - -#elif defined(_MSC_VER) +extern mi_decl_hidden size_t _mi_cpu_movsb_max; // in init.c +extern mi_decl_hidden size_t _mi_cpu_stosb_max; -#include // LONG_MAX 
-#include // BitScanReverse64 -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanReverse(&idx, x); -#else - _BitScanReverse64(&idx, x); -#endif - return ((MI_INTPTR_BITS - 1) - idx); -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanForward(&idx, x); -#else - _BitScanForward64(&idx, x); -#endif - return idx; -} - -#else -static inline size_t mi_ctz32(uint32_t x) { - // de Bruijn multiplication, see - static const unsigned char debruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 - }; - if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; -} -static inline size_t mi_clz32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, - 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 - }; - if (x==0) return 32; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; -} - -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_clz32((uint32_t)x); -#else - size_t count = mi_clz32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz32((uint32_t)x)); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_ctz32((uint32_t)x); -#else - size_t count = mi_ctz32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz32((uint32_t)(x>>32))); -#endif +static inline void mi_rep_movsb(void* dst, const void* src, size_t n) { + #if defined(__GNUC__) + __asm volatile("rep movsb" : "+D"(dst), "+c"(n), "+S"(src) : : "memory"); + 
#else + __movsb((unsigned char*)dst, (const unsigned char*)src, n); + #endif } -#endif - -// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero) -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x)); +static inline void mi_rep_stosb(void* dst, uint8_t val, size_t n) { + #if defined(__GNUC__) + __asm volatile("rep stosb" : "+D"(dst), "+c"(n) : "a"(val) : "memory"); + #else + __stosb((unsigned char*)dst, val, n); + #endif } - -// --------------------------------------------------------------------------------- -// Provide our own `_mi_memcpy` for potential performance optimizations. -// -// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if -// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support -// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. -// --------------------------------------------------------------------------------- - -#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) -#include -extern bool _mi_cpu_has_fsrm; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { - if (_mi_cpu_has_fsrm) { - __movsb((unsigned char*)dst, (const unsigned char*)src, n); + if mi_likely(n <= _mi_cpu_movsb_max) { // has fsrm && n <= 127 (todo: and maybe has erms?) 
+ mi_rep_movsb(dst, src, n); } else { memcpy(dst, src, n); } } -static inline void _mi_memzero(void* dst, size_t n) { - if (_mi_cpu_has_fsrm) { - __stosb((unsigned char*)dst, 0, n); + +static inline void _mi_memset(void* dst, int val, size_t n) { + if mi_likely(n <= _mi_cpu_stosb_max) { // has fsrs && n <= 127 + mi_rep_stosb(dst, (uint8_t)val, n); } else { - memset(dst, 0, n); + memset(dst, val, n); } } + #else + static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } -static inline void _mi_memzero(void* dst, size_t n) { - memset(dst, 0, n); + +static inline void _mi_memset(void* dst, int val, size_t n) { + memset(dst, val, n); } + #endif // ------------------------------------------------------------------------------- @@ -988,6 +1239,7 @@ static inline void _mi_memzero(void* dst, size_t n) { // ------------------------------------------------------------------------------- #if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) + // On GCC/CLang we provide a hint that the pointers are word aligned. 
static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); @@ -996,23 +1248,35 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(adst, asrc, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); - _mi_memzero(adst, n); + _mi_memset(adst, val, n); } + #else + // Default fallback on `_mi_memcpy` static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); _mi_memcpy(dst, src, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); - _mi_memzero(dst, n); + _mi_memset(dst, val, n); } + #endif +static inline void _mi_memzero(void* dst, size_t n) { + _mi_memset(dst, 0, n); +} -#endif +static inline void _mi_memzero_aligned(void* dst, size_t n) { + _mi_memset_aligned(dst, 0, n); +} + + + +#endif // MI_INTERNAL_H diff --git a/system/lib/mimalloc/include/mimalloc/prim.h b/system/lib/mimalloc/include/mimalloc/prim.h index 3f4574ddd9270..96642bbd079d6 100644 --- a/system/lib/mimalloc/include/mimalloc/prim.h +++ b/system/lib/mimalloc/include/mimalloc/prim.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
@@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file #pragma once #ifndef MIMALLOC_PRIM_H #define MIMALLOC_PRIM_H - +#include "internal.h" // mi_decl_hidden // -------------------------------------------------------------------------- // This file specifies the primitive portability API. @@ -22,12 +22,15 @@ terms of the MIT license. A copy of the license can be found in the file // OS memory configuration typedef struct mi_os_mem_config_s { - size_t page_size; // default to 4KiB - size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows) - size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB) - bool has_overcommit; // can we reserve more memory than can be actually committed? - bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc) - bool has_virtual_reserve; // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory) + size_t page_size; // default to 4KiB + size_t large_page_size; // 0 if not supported, usually 2MiB (4MiB on Windows) + size_t alloc_granularity; // smallest allocation size (usually 4KiB, on Windows 64KiB) + size_t physical_memory_in_kib; // physical memory size in KiB + size_t virtual_address_bits; // usually 48 or 56 bits on 64-bit systems. (used to determine secure randomization) + bool has_overcommit; // can we reserve more memory than can be actually committed? + bool has_partial_free; // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc) + bool has_virtual_reserve; // supports virtual address space reservation? 
(if true we can reserve virtual address space without using commit or physical memory) + bool has_transparent_huge_pages; // true if transparent huge pages are enabled (on Linux) } mi_os_mem_config_t; // Initialize @@ -41,9 +44,10 @@ int _mi_prim_free(void* addr, size_t size ); // If `commit` is false, the virtual memory range only needs to be reserved (with no access) // which will later be committed explicitly using `_mi_prim_commit`. // `is_zero` is set to true if the memory was zero initialized (as on most OS's) +// The `hint_addr` address is either `NULL` or a preferred allocation address but can be ignored. // pre: !commit => !allow_large // try_alignment >= _mi_os_page_size() and a power of 2 -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr); // Commit memory. Returns error code or 0 on success. // For example, on Linux this would make the memory PROT_READ|PROT_WRITE. @@ -56,10 +60,15 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero); // pre: needs_recommit != NULL int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit); -// Reset memory. The range keeps being accessible but the content might be reset. +// Reset memory. The range keeps being accessible but the content might be reset to zero at any moment. // Returns error code or 0 on success. int _mi_prim_reset(void* addr, size_t size); +// Reuse memory. This is called for memory that is already committed but +// may have been reset (`_mi_prim_reset`) or decommitted (`_mi_prim_decommit`) where `needs_recommit` was false. +// Returns error code or 0 on success. On most platforms this is a no-op. +int _mi_prim_reuse(void* addr, size_t size); + // Protect memory. Returns error code or 0 on success. 
int _mi_prim_protect(void* addr, size_t size, bool protect); @@ -111,18 +120,18 @@ void _mi_prim_thread_init_auto_done(void); // Called on process exit and may take action to clean up resources associated with the thread auto done. void _mi_prim_thread_done_auto_done(void); -// Called when the default heap for a thread changes -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); +// Called when the default theap for a thread changes +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap); + +// Is this thread part of a thread pool? +bool _mi_prim_thread_is_in_threadpool(void); //------------------------------------------------------------------- -// Thread id: `_mi_prim_thread_id()` -// -// Getting the thread id should be performant as it is called in the -// fast path of `_mi_free` and we specialize for various platforms as -// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. -// We only require _mi_prim_thread_id() to return a unique id -// for each thread (unequal to zero). +// Access to TLS (thread local storage) slots. +// We need fast access to both a unique thread id (in `free.c:mi_free`) and +// to a thread-local theap pointer (in `alloc.c:mi_malloc`). +// To achieve this we use specialized code for various platforms. //------------------------------------------------------------------- // On some libc + platform combinations we can directly access a thread-local storage (TLS) slot. @@ -132,21 +141,28 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // // Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly, // but unfortunately we can not detect support reliably (see issue #883) -// We also use it on Apple OS as we use a TLS slot for the default heap there. 
-#if defined(__GNUC__) && ( \ - (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ +// We also use it on Apple OS as we use a TLS slot for the default theap there. +#if (defined(_WIN32)) || \ + (defined(__GNUC__) && ( \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ - || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ - ) - -#define MI_HAS_TLS_SLOT + )) static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { void* res; const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) + #if defined(_WIN32) + #if (_M_X64 || _M_AMD64) && !defined(_M_ARM64EC) + res = (void*)__readgsqword((unsigned long)ofs); // direct load at offset from gs + #elif _M_IX86 && !defined(_M_ARM64EC) + res = (void*)__readfsdword((unsigned long)ofs); // direct load at offset from fs + #else + res = ((void**)NtCurrentTeb())[slot]; MI_UNUSED(ofs); + #endif + #elif defined(__i386__) __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS #elif defined(__APPLE__) && defined(__x86_64__) __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS @@ -169,14 +185,24 @@ static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 MI_UNUSED(ofs); res = pthread_getspecific(slot); + #else + #define 
MI_HAS_TLS_SLOT 0 + MI_UNUSED(ofs); + res = NULL; #endif return res; } +#ifndef MI_HAS_TLS_SLOT +#define MI_HAS_TLS_SLOT 1 +#endif + // setting a tls slot is only used on macOS for now static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) + #if defined(_WIN32) + ((void**)NtCurrentTeb())[slot] = value; MI_UNUSED(ofs); + #elif defined(__i386__) __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS #elif defined(__APPLE__) && defined(__x86_64__) __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS @@ -199,16 +225,36 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 MI_UNUSED(ofs); pthread_setspecific(slot, value); + #else + MI_UNUSED(ofs); MI_UNUSED(value); #endif } #endif + +// defined in `init.c`; do not use these directly +extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_main; // theap belonging to the main heap +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? + + +//------------------------------------------------------------------- +// Get a fast unique thread id. +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. +// We only require _mi_prim_thread_id() to return a unique id +// for each thread (unequal to zero). +//------------------------------------------------------------------- + + // Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id // but unfortunately, it seems we cannot test for this reliably at this time (see issue #883) // Nevertheless, it seems needed on older graviton platforms (see issue #851). 
// For now, we only enable this for specific platforms. #if !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + && !defined(__CYGWIN__) \ && !defined(MI_LIBC_MUSL) \ && (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot () */ #if (defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \ @@ -218,42 +264,39 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif #endif +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept; - -// defined in `init.c`; do not use these directly -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? - -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + const mi_threadid_t tid = __mi_prim_thread_id(); + mi_assert_internal(tid > 1); + mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 2 bits are clear? + return tid; +} // Get a unique id for the current thread. 
#if defined(MI_PRIM_THREAD_ID) -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) } #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); } #elif MI_USE_BUILTIN_THREAD_POINTER -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Works on most Unix based platforms with recent compilers return (uintptr_t)__builtin_thread_pointer(); } -#elif defined(MI_HAS_TLS_SLOT) +#elif MI_HAS_TLS_SLOT -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 @@ -269,8 +312,8 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). 
-static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - return (uintptr_t)&_mi_heap_default; +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&__mi_theap_main; } #endif @@ -278,96 +321,215 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { /* ---------------------------------------------------------------------------------------- -The thread local default heap: `_mi_prim_get_default_heap()` -This is inlined here as it is on the fast path for allocation functions. +Get the thread local default theap: `_mi_theap_default()` (and the cached heap `_mi_theap_cached`). -On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a -__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures -that the storage will always be available (allocated on the thread stacks). - -On some platforms though we cannot use that when overriding `malloc` since the underlying -TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. -We try to circumvent this in an efficient way: -- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the - loader itself calls `malloc` even before the modules are initialized. -- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). -- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) +This is inlined here as it is on the fast path for allocation functions. +We have 4 models: + +- MI_TLS_MODEL_THREAD_LOCAL: use regular thread local (default on Linux, FreeBSD, etc) + On most platforms (Linux, FreeBSD, NetBSD, etc), this just returns a + thread local variable (`__mi_theap_default`). With the initial-exec TLS model this ensures + that the storage will always be available and properly initialized (with an empty theap). 
+ + On some platforms the underlying TLS implementation (or the loader) will call itself `malloc` + on a first access to a thread local and recurse in the MI_TLS_MODEL_THREAD_LOCAL. + A way around this is to define MI_TLS_RECURSE_GUARD which adds an extra check if the process + is initialized before accessing the thread-local. This is a check in the fast path though + so this should be avoided. + +- MI_TLS_MODEL_FIXED_SLOT: use a fixed slot in the TLS block (default on macOS) + This reserves an unused and fixed TLS slot. This is fast and avoids the problem + where the underlying TLS implementation (or the loader) will call itself `malloc` + on a first access to a thread local (and recurse in the MI_TLS_MODEL_THREAD_LOCAL). + This goes wrong though if the OS or a library uses the same fixed slot. + +- MI_TLS_MODEL_DYNAMIC_WIN32: use a dynamically allocated slot with TlsAlloc. (default on Windows) + Windows has somewhat slow thread locals so by default we use TlsAlloc'd slots which + can be more efficient. First tries to use one of the "direct" first 64 slots which + are the fastest, but falls back to using "expansion" slots when needed (up to 1088 slots). + (If the allocated slot happens to always be under 64 for a particular program, + one might use cmake with `-DMI_WIN_DIRECT_TLS=ON` to skip the expansion slot test in the fast path.) + +- MI_TLS_MODEL_DYNAMIC_PTHREADS: use `pthread_getspecific`. (default on OpenBSD, maybe good for Android as well?) + Use pthread local storage. Somewhat slow but can work well depending on the platform. + +Each model should define `MI_THEAP_INITASNULL` to signify that the initial value +returned from `_mi_theap_default()` can be `NULL` (instead of the address of the empty heap). +This incurs an extra check in the fast path (but can often be combined in an existing check). 
------------------------------------------------------------------------------------------- */ -static inline mi_heap_t* mi_prim_get_default_heap(void); - -#if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) // macOS - #define MI_TLS_SLOT 89 // seems unused? - // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) +static inline mi_theap_t* _mi_theap_default(void); +static inline mi_theap_t* _mi_theap_cached(void); + +#if defined(_WIN32) + #define MI_TLS_MODEL_DYNAMIC_WIN32 1 +#elif defined(__APPLE__) // macOS + // #define MI_TLS_MODEL_DYNAMIC_PTHREADS 1 // also works but a bit slower + #define MI_TLS_MODEL_FIXED_SLOT 1 + #define MI_TLS_MODEL_FIXED_SLOT_DEFAULT 108 // seems unused. @apple: it would be great to get 2 official slots for custom allocators :-) + #define MI_TLS_MODEL_FIXED_SLOT_CACHED 109 + // we used before __PTK_FRAMEWORK_OLDGC_KEY9 (89) but that seems used now. // see -#elif defined(__OpenBSD__) - // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) - // see - #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) - // #elif defined(__DragonFly__) - // #warning "mimalloc is not working correctly on DragonFly yet." - // #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) 
-#elif defined(__ANDROID__) - // See issue #381 - #define MI_TLS_PTHREAD -#endif +#elif defined(__OpenBSD__) || defined(__ANDROID__) + #define MI_TLS_MODEL_DYNAMIC_PTHREADS 1 + // #define MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL 1 +#else + #define MI_TLS_MODEL_THREAD_LOCAL 1 #endif +// Declared this way to optimize register spills and branches +mi_decl_cold mi_decl_noinline mi_theap_t* _mi_theap_empty_get(void); -#if defined(MI_TLS_SLOT) -# if !defined(MI_HAS_TLS_SLOT) -# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" -# endif - -static inline mi_heap_t* mi_prim_get_default_heap(void) { - mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); - if mi_unlikely(heap == NULL) { - #ifdef __GNUC__ - __asm(""); // prevent conditional load of the address of _mi_heap_empty - #endif - heap = (mi_heap_t*)&_mi_heap_empty; - } - return heap; +static inline mi_theap_t* __mi_theap_empty(void) { + #if __GNUC__ + __asm(""); // prevent conditional load + return (mi_theap_t*)&_mi_theap_empty; + #else + return _mi_theap_empty_get(); + #endif } -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) +#if MI_TLS_MODEL_THREAD_LOCAL +// Thread local with an initial value (default on Linux). Very efficient. 
+ +extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_default; // default theap to allocate from +extern mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_cached; // theap from the last used heap -static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) { - pthread_t self = pthread_self(); - #if defined(__DragonFly__) - if (self==NULL) return NULL; +static inline mi_theap_t* _mi_theap_default(void) { + #if defined(MI_TLS_RECURSE_GUARD) + if (mi_unlikely(!_mi_process_is_initialized)) return _mi_theap_empty_get(); #endif - return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); + return __mi_theap_default; } -static inline mi_heap_t* mi_prim_get_default_heap(void) { - mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot(); - if mi_unlikely(pheap == NULL) return _mi_heap_main_get(); - mi_heap_t* heap = *pheap; - if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty; - return heap; +static inline mi_theap_t* _mi_theap_cached(void) { + return __mi_theap_cached; } -#elif defined(MI_TLS_PTHREAD) +#elif MI_TLS_MODEL_FIXED_SLOT +// Fixed TLS slot (default on macOS). +#define MI_THEAP_INITASNULL 1 + +static inline mi_theap_t* _mi_theap_default(void) { + return (mi_theap_t*)mi_prim_tls_slot(MI_TLS_MODEL_FIXED_SLOT_DEFAULT); +} -extern pthread_key_t _mi_heap_default_key; -static inline mi_heap_t* mi_prim_get_default_heap(void) { - mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); - return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +static inline mi_theap_t* _mi_theap_cached(void) { + return (mi_theap_t*)mi_prim_tls_slot(MI_TLS_MODEL_FIXED_SLOT_CACHED); } -#else // default using a thread local variable; used on most platforms. 
+#elif MI_TLS_MODEL_DYNAMIC_WIN32 +// Dynamic TLS slot (default on Windows) +#define MI_THEAP_INITASNULL 1 -static inline mi_heap_t* mi_prim_get_default_heap(void) { - #if defined(MI_TLS_RECURSE_GUARD) - if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); +// We try to use direct slots (64), but can also use the expansion slots (upto 1024 extra available) +// See for the offsets. +#if MI_SIZE_SIZE==4 +#define MI_TLS_EXPANSION_SLOT (0x0F94 / MI_SIZE_SIZE) +#else +#define MI_TLS_EXPANSION_SLOT (0x1780 / MI_SIZE_SIZE) +#endif + +extern mi_decl_hidden size_t _mi_theap_default_slot; +extern mi_decl_hidden size_t _mi_theap_cached_slot; +extern mi_decl_hidden size_t _mi_theap_default_expansion_slot; +extern mi_decl_hidden size_t _mi_theap_cached_expansion_slot; + +static inline mi_theap_t* _mi_theap_default(void) { + const size_t slot = _mi_theap_default_slot; + mi_theap_t* theap = (mi_theap_t*)mi_prim_tls_slot(slot); + #if !MI_WIN_DIRECT_TLS + if mi_unlikely(slot==MI_TLS_EXPANSION_SLOT) { // in TlsExpansionSlots ? + if mi_likely(theap!=NULL) { // initialized (on this thread)? + theap = ((mi_theap_t**)theap)[_mi_theap_default_expansion_slot]; + } + } #endif - return _mi_heap_default; + return theap; } -#endif // mi_prim_get_default_heap() +static inline mi_theap_t* _mi_theap_cached(void) { + const size_t slot = _mi_theap_cached_slot; + mi_theap_t* theap = (mi_theap_t*)mi_prim_tls_slot(slot); + #if !MI_WIN_DIRECT_TLS + if mi_unlikely(slot==MI_TLS_EXPANSION_SLOT) { // in TlsExpansionSlots ? + if mi_likely(theap!=NULL) { // initialized (on this thread)? + theap = ((mi_theap_t**)theap)[_mi_theap_cached_expansion_slot]; + } + } + #endif + return theap; +} +#elif MI_TLS_MODEL_DYNAMIC_PTHREADS +// Dynamic pthread slot on less common platforms. This is not too bad. 
(default on OpenBSD)
+#define MI_THEAP_INITASNULL  1
+extern mi_decl_hidden pthread_key_t _mi_theap_default_key;
+extern mi_decl_hidden pthread_key_t _mi_theap_cached_key;
+
+static inline mi_theap_t* _mi_theap_default(void) {
+  #if !MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL
+  // we can skip this check if using the initial key will return NULL from pthread_getspecific
+  if mi_unlikely(_mi_theap_default_key==0) { return NULL; }
+  #endif
+  return (mi_theap_t*)pthread_getspecific(_mi_theap_default_key);
+}
+
+static inline mi_theap_t* _mi_theap_cached(void) {
+  #if !MI_TLS_MODEL_DYNAMIC_PTHREADS_DEFAULT_ENTRY_IS_NULL
+  // we can skip this check if using the initial key will return NULL from pthread_getspecific
+  if mi_unlikely(_mi_theap_cached_key==0) { return NULL; }
+  #endif
+  return (mi_theap_t*)pthread_getspecific(_mi_theap_cached_key);
+}
+
+#else
+#error "no TLS model is defined for this platform?"
+#endif
+
+
+// Check if a thread is initialized (without using a thread-local if using fixed slots)
+static inline bool _mi_thread_is_initialized(void) {
+  return (mi_theap_is_initialized(_mi_theap_default()));
+}
+
+// Get (and possibly create) the theap belonging to a heap
+// We cache the last accessed theap in `_mi_theap_cached` for better performance.
+static inline mi_theap_t* _mi_heap_theap(const mi_heap_t* heap) {
+  mi_theap_t* theap = _mi_theap_cached();
+  #if MI_THEAP_INITASNULL
+  if mi_likely(theap!=NULL && _mi_theap_heap(theap)==heap) return theap;
+  #else
+  if mi_likely(_mi_theap_heap(theap)==heap) return theap;
+  #endif
+  return _mi_heap_theap_get_or_init(heap);
+}
+
+// Get the theap belonging to a heap without creating it if it is not yet initialized.
+static inline mi_theap_t* _mi_heap_theap_peek(const mi_heap_t* heap) { + mi_theap_t* theap = _mi_theap_cached(); + #if MI_THEAP_INITASNULL + if mi_unlikely(theap==NULL || _mi_theap_heap(theap)!=heap) + #else + if mi_unlikely(_mi_theap_heap(theap)!=heap) + #endif + { + theap = _mi_heap_theap_get_peek(heap); // don't update the cache on a query (?) + } + mi_assert(theap==NULL || _mi_theap_heap(theap)==heap); + return theap; +} + +// Find the associated theap or NULL if it does not exist (during shutdown) +// Should be fast as it is called in `free.c:mi_free_try_collect`. +static inline mi_theap_t* _mi_page_associated_theap_peek(mi_page_t* page) { + mi_heap_t* const heap = page->heap; + mi_theap_t* theap; + if mi_likely(heap==NULL) { theap = __mi_theap_main; } // note: on macOS accessing the thread_local can cause allocation during thread shutdown (and reinitialize the thread)! + else { theap = _mi_heap_theap_peek(heap); } + mi_assert_internal(theap==NULL || _mi_thread_id()==theap->tld->thread_id); + return theap; +} -#endif // MIMALLOC_PRIM_H +#endif // MI_PRIM_H diff --git a/system/lib/mimalloc/include/mimalloc/track.h b/system/lib/mimalloc/include/mimalloc/track.h index a659d94044670..8f8b93f9aa16c 100644 --- a/system/lib/mimalloc/include/mimalloc/track.h +++ b/system/lib/mimalloc/include/mimalloc/track.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TRACK_H -#define MIMALLOC_TRACK_H +#ifndef MI_TRACK_H +#define MI_TRACK_H /* ------------------------------------------------------------------------------------------------------ Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. 
@@ -34,7 +34,7 @@ The corresponding `mi_track_free` still uses the block start pointer and origina The `mi_track_resize` is currently unused but could be called on reallocations within a block. `mi_track_init` is called at program start. -The following macros are for tools like asan and valgrind to track whether memory is +The following macros are for tools like asan and valgrind to track whether memory is defined, undefined, or not accessible at all: #define mi_track_mem_defined(p,size) @@ -47,7 +47,7 @@ defined, undefined, or not accessible at all: // valgrind tool #define MI_TRACK_ENABLED 1 -#define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on heap_destroy +#define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on theap_destroy #define MI_TRACK_TOOL "valgrind" #include @@ -82,10 +82,6 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include "../src/prim/windows/etw.h" #define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); @@ -96,7 +92,7 @@ defined, undefined, or not accessible at all: // no tracking #define MI_TRACK_ENABLED 0 -#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "none" #define mi_track_malloc_size(p,reqsize,size,zero) @@ -146,4 +142,4 @@ defined, undefined, or not accessible at all: } #endif -#endif +#endif // MI_TRACK_H diff --git a/system/lib/mimalloc/include/mimalloc/types.h b/system/lib/mimalloc/include/mimalloc/types.h index 2fdde904bbdb3..b0c67ce616f63 100644 --- a/system/lib/mimalloc/include/mimalloc/types.h +++ b/system/lib/mimalloc/include/mimalloc/types.h @@ -1,34 +1,36 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it 
under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TYPES_H -#define MIMALLOC_TYPES_H +#ifndef MI_TYPES_H +#define MI_TYPES_H // -------------------------------------------------------------------------- // This file contains the main type definitions for mimalloc: -// mi_heap_t : all data for a thread-local heap, contains -// lists of all managed heap pages. -// mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. A segment is divided in slices (64KiB) from -// which pages are allocated. -// mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from -// where objects are allocated. +// mi_heap_t : all data for a heap; usually there is just one main default heap. +// mi_theap_t : a thread local heap belonging to a specific heap: +// maintains lists of thread-local heap pages that have free space. +// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from +// where objects of a single size are allocated. // Note: we write "OS page" for OS memory pages while // using plain "page" for mimalloc pages (`mi_page_t`). +// mi_arena_t : a large memory area where pages are allocated (process shared) +// mi_tld_t : thread local data +// mi_subproc_t : all heaps belong to a sub-process (usually just the main one) // -------------------------------------------------------------------------- +#include #include // ptrdiff_t #include // uintptr_t, uint16_t, etc -#include "atomic.h" // _Atomic - -#ifdef _MSC_VER -#pragma warning(disable:4214) // bitfield is not int -#endif +#include // bool +#include // SIZE_MAX etc. +#include // error codes +#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations +#include "atomic.h" // _Atomic primitives // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. 
This must be at least `sizeof(void*)` @@ -36,6 +38,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) #endif + // ------------------------------------------------------ // Variants // ------------------------------------------------------ @@ -51,29 +54,50 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 -// Define MI_SECURE to enable security mitigations -// #define MI_SECURE 1 // guard page around metadata -// #define MI_SECURE 2 // guard page around each mimalloc page -// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) -// #define MI_SECURE 4 // checks for double free. (may be more expensive) +// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact, +// but protects most metadata with guard pages: +// #define MI_SECURE 1 // guard page around metadata; check pointer validity on free +// +// Level 2 is only used if `MI_PAGE_META_IS_SEPARATED==0` (which it is not by default) +// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large theaps..) +// +// Level 3 has slightly more performance overhead +// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) +// +// Level 4 has (much) more overhead. It also adds guard pages around each mimalloc page (even if `MI_PAGE_META_IS_SEPARATED` is defined). +// #define MI_SECURE 4 // checks also for double free. #if !defined(MI_SECURE) #define MI_SECURE 0 #endif -// Define MI_DEBUG for debug mode -// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. 
-// #define MI_DEBUG 2 // + internal assertion checks +// Define MI_DEBUG for assertion and invariant checking +// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. (cmake -DMI_DEBUG=ON) +// #define MI_DEBUG 2 // + internal assertion checks (cmake -DMI_DEBUG_INTERNAL=ON) // #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON) #if !defined(MI_DEBUG) -#if !defined(NDEBUG) || defined(_DEBUG) +#if defined(MI_BUILD_RELEASE) || defined(NDEBUG) +#define MI_DEBUG 0 +#else #define MI_DEBUG 2 +#endif +#endif + +// Statistics (0=only essential, 1=normal, 2=more fine-grained (expensive) tracking) +#ifndef MI_STAT +#if (MI_DEBUG>0) +#define MI_STAT 2 #else -#define MI_DEBUG 0 +#define MI_STAT 0 +#endif #endif + +// Enable guard pages behind objects of a certain size (set by the MIMALLOC_GUARDED_MIN/MAX/SAMPLE_RATE options) +#if !defined(MI_GUARDED) && MI_DEBUG && !defined(NDEBUG) && !MI_PAGE_META_ALIGNED_FREE_SMALL +#define MI_GUARDED 1 #endif -// Reserve extra padding at the end of each block to be more resilient against heap block overflows. +// Reserve extra padding at the end of each block to be more resilient against theap block overflows. // The padding can detect buffer overflow on free. #if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW)) #define MI_PADDING 1 @@ -91,134 +115,207 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif +// Enable large pages for objects between 64KiB and 512KiB. +// This should perhaps be disabled by default as for many workloads the block sizes above 64 KiB +// are quite random which can lead to too many partially used large pages (but see issue #1104). 
+#ifndef MI_ENABLE_LARGE_PAGES +#define MI_ENABLE_LARGE_PAGES 1 +#endif -// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. -// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from -// another thread so the memory becomes "virtually" available (and eventually gets properly freed by -// the owning thread). -// #define MI_HUGE_PAGE_ABANDON 1 +// Place page meta info at the start of the page area or keep it separate? +// Separate keeps the page info at the arena start (default) which is more secure +// and reduces wasted space due to alignment and block sizes. +// (but also reserves more memory up front (about 2MiB per GiB)) +#if !defined(MI_PAGE_META_IS_SEPARATED) +#if MI_PAGE_MAP_FLAT +#define MI_PAGE_META_IS_SEPARATED 0 +#else +#define MI_PAGE_META_IS_SEPARATED 1 +#endif +#endif +// We can choose to only put page info of small pages at the start of the page area. +// This can be used to have a slightly faster `mi_free_small` function for specialized +// cases (like language runtime systems). +#if !defined(MI_PAGE_META_ALIGNED_FREE_SMALL) +#define MI_PAGE_META_ALIGNED_FREE_SMALL 0 +#endif -// ------------------------------------------------------ -// Platform specific values -// ------------------------------------------------------ +// Configuration checks +#if !MI_PAGE_META_IS_SEPARATED && MI_SECURE +#error "secure mode should use separated page infos" +#endif +#if MI_PAGE_META_ALIGNED_FREE_SMALL && MI_SECURE +#error "secure mode cannot use MI_PAGE_META_ALIGNED_FREE_SMALL" +#endif +#if MI_PAGE_META_IS_SEPARATED && MI_PAGE_MAP_FLAT +#error "cannot have a flat page map with separated page infos" +#endif +#if MI_DEBUG && NDEBUG +#warning "mimalloc assertions enabled in a release build" +#endif -// ------------------------------------------------------ -// Size of a pointer. 
-// We assume that `sizeof(void*)==sizeof(intptr_t)` -// and it holds for all platforms we know of. -// -// However, the C standard only requires that: -// p == (void*)((intptr_t)p)) -// but we also need: -// i == (intptr_t)((void*)i) -// or otherwise one might define an intptr_t type that is larger than a pointer... -// ------------------------------------------------------ -#if INTPTR_MAX > INT64_MAX -# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) -#elif INTPTR_MAX == INT64_MAX -# define MI_INTPTR_SHIFT (3) -#elif INTPTR_MAX == INT32_MAX -# define MI_INTPTR_SHIFT (2) -#else -#error platform pointers must be 32, 64, or 128 bits +// -------------------------------------------------------------- +// Sizes of internal data-structures +// (comments specify sizes on 64-bit, usually 32-bit is halved) +// -------------------------------------------------------------- + +// Main size parameter; determines max arena sizes and max arena object sizes etc. +#ifndef MI_ARENA_SLICE_SHIFT + #ifdef MI_SMALL_PAGE_SHIFT // backward compatibility + #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT + #elif MI_SECURE && __APPLE__ && MI_ARCH_ARM64 + #define MI_ARENA_SLICE_SHIFT (17) // 128 KiB to not waste too much due to 16 KiB guard pages + #else + #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) + #endif +#endif +#if MI_ARENA_SLICE_SHIFT < 12 +#error Arena slices should be at least 4KiB #endif -#if SIZE_MAX == UINT64_MAX -# define MI_SIZE_SHIFT (3) -typedef int64_t mi_ssize_t; -#elif SIZE_MAX == UINT32_MAX -# define MI_SIZE_SHIFT (2) -typedef int32_t mi_ssize_t; -#else -#error platform objects must be 32 or 64 bits +#ifndef MI_BCHUNK_BITS_SHIFT + #if MI_ARENA_SLICE_SHIFT <= 13 // <= 8KiB + #define MI_BCHUNK_BITS_SHIFT (7) // 128 bits + #elif MI_ARENA_SLICE_SHIFT < 16 // <= 32KiB + #define MI_BCHUNK_BITS_SHIFT (8) // 256 bits + #else + #define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // 512 bits (or 256 on 32-bit) + #endif 
#endif -#if (SIZE_MAX/2) > LONG_MAX -# define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL -#else -# define MI_ZU(x) x##UL -# define MI_ZI(x) x##L +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps in arena's are "bchunks" of 512 bits +#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB +#define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) + +#define MI_ARENA_MIN_OBJ_SLICES (1) +#define MI_ARENA_MAX_CHUNK_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (or 8 MiB on 32-bit) + +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MAX_CHUNK_OBJ_SIZE (MI_ARENA_MAX_CHUNK_OBJ_SLICES * MI_ARENA_SLICE_SIZE) + +#if MI_ARENA_MAX_CHUNK_OBJ_SIZE < MI_SIZE_SIZE*1024 +#error maximum object size may be too small to hold local thread data #endif -#define MI_INTPTR_SIZE (1<) +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX + +// Minimal commit for a page on-demand commit (should be >= OS page size) +#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // ------------------------------------------------------ -// Main internal data-structures +// Arena's are large reserved areas of memory allocated from +// the OS that are managed by mimalloc to efficiently +// allocate MI_ARENA_SLICE_SIZE slices of memory for the +// mimalloc pages. // ------------------------------------------------------ -// Main tuning parameters for segment and page sizes -// Sizes for 64-bit (usually divide by two for 32-bit) -#ifndef MI_SEGMENT_SLICE_SHIFT -#define MI_SEGMENT_SLICE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB (32KiB on 32-bit) -#endif +// A large memory arena where pages are allocated in. 
+typedef struct mi_arena_s mi_arena_t; // defined below -#ifndef MI_SEGMENT_SHIFT -#if MI_INTPTR_SIZE > 4 -#define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB -#else -#define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit -#endif -#endif -#ifndef MI_SMALL_PAGE_SHIFT -#define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB -#endif -#ifndef MI_MEDIUM_PAGE_SHIFT -#define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB -#endif +// ------------------------------------------------------ +// Heaps contain allocated blocks. Heaps are self-contained +// but share the (sub-process) memory in the arena's. +// ------------------------------------------------------ -// Derived constants -#define MI_SEGMENT_SIZE (MI_ZU(1)<= 655360) -#error "mimalloc internal: define more bins" -#endif +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- -// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`) -#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) +// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. +// The memid keeps track of this. +typedef enum mi_memkind_e { + MI_MEM_NONE, // not allocated + MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) + MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main theap data for example (`init.c`)) + MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`) + MI_MEM_OS, // allocated from the OS + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) + MI_MEM_ARENA, // allocated from an arena (the usual case) (`arena.c`) + MI_MEM_HEAP_MAIN // allocated in the main heap (for theaps) +} mi_memkind_t; -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +static inline bool mi_memkind_is_os(mi_memkind_t memkind) { + return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); +} -// Maximum slice count (255) for which we can find the page for interior pointers -#define MI_MAX_SLICE_OFFSET_COUNT ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1) +static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) { + return (memkind <= MI_MEM_STATIC); +} -// we never allocate more than PTRDIFF_MAX (see also ) -// on 64-bit+ systems we also limit the maximum allocation size such that the slice count fits in 32-bits. (issue #877) -#if (PTRDIFF_MAX > INT32_MAX) && (PTRDIFF_MAX >= (MI_SEGMENT_SLIZE_SIZE * UINT32_MAX)) -#define MI_MAX_ALLOC_SIZE (MI_SEGMENT_SLICE_SIZE * (UINT32_MAX-1)) -#else -#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX -#endif + +typedef struct mi_memid_os_info { + void* base; // actual base address of the block (used for offset aligned allocations) + size_t size; // allocated full size + // size_t alignment; // alignment at allocation +} mi_memid_os_info_t; + +typedef struct mi_memid_arena_info { + mi_arena_t* arena; // arena that contains this memory + uint32_t slice_index; // slice index in the arena + uint32_t slice_count; // allocated slices +} mi_memid_arena_info_t; + +typedef struct mi_memid_meta_info { + void* meta_page; // meta-page that contains the block + uint32_t block_index; // block index in the meta-data page + uint32_t block_count; // allocated blocks +} mi_memid_meta_info_t; + +typedef struct mi_memid_s { + union { + mi_memid_os_info_t os; // only used for MI_MEM_OS + mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + mi_memid_meta_info_t meta; // only used for MI_MEM_META + 
} mem; + mi_memkind_t memkind; + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) + bool initially_committed;// `true` if the memory was originally allocated as committed + bool initially_zero; // `true` if the memory was originally zero initialized +} mi_memid_t; + + +static inline bool mi_memid_is_os(mi_memid_t memid) { + return mi_memkind_is_os(memid.memkind); +} + +static inline bool mi_memid_needs_no_free(mi_memid_t memid) { + return mi_memkind_needs_no_free(memid.memkind); +} + +static inline mi_arena_t* mi_memid_arena(mi_memid_t memid) { + return (memid.memkind == MI_MEM_ARENA ? memid.mem.arena.arena : NULL); +} // ------------------------------------------------------ @@ -238,38 +335,26 @@ typedef struct mi_block_s { } mi_block_t; -// The delayed flags are used for efficient multi-threaded free-ing -typedef enum mi_delayed_e { - MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list - MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap - MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim -} mi_delayed_t; - - -// The `in_full` and `has_aligned` page flags are put in a union to efficiently -// test if both are false (`full_aligned == 0`) in the `mi_free` routine. 
-#if !MI_TSAN
-typedef union mi_page_flags_s {
-  uint8_t full_aligned;
-  struct {
-    uint8_t in_full : 1;
-    uint8_t has_aligned : 1;
-  } x;
-} mi_page_flags_t;
-#else
-// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
-typedef union mi_page_flags_s {
-  uint16_t full_aligned;
-  struct {
-    uint8_t in_full;
-    uint8_t has_aligned;
-  } x;
-} mi_page_flags_t;
-#endif
+// The page flags are put in the bottom 2 bits of the thread_id (for a fast test in `mi_free`)
+// `has_interior_pointers` is true if the page has pointers at an offset in a block (so we have to unalign to the block start before free-ing)
+// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing)
+#define MI_PAGE_IN_FULL_QUEUE          MI_ZU(0x01)
+#define MI_PAGE_HAS_INTERIOR_POINTERS  MI_ZU(0x02)
+#define MI_PAGE_FLAG_MASK              MI_ZU(0x03)
+typedef size_t mi_page_flags_t;
+
+// There are two special threadid's: 0 for pages that are abandoned (and not in a theap queue),
+// and 4 for abandoned & mapped threads -- abandoned-mapped pages are abandoned but also mapped
+// in an arena (in `mi_heap_t.arena_pages.pages_abandoned`) so these can be quickly found for reuse.
+// Abandoning partially used pages allows for sharing of this memory between threads (in particular if threads are blocked)
+#define MI_THREADID_ABANDONED         MI_ZU(0)
+#define MI_THREADID_ABANDONED_MAPPED  (MI_PAGE_FLAG_MASK + 1)

 // Thread free list.
-// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+// Points to a list of blocks that are freed by other threads.
+// The least-bit is set if the page is owned by the current thread. (`mi_page_is_owned`).
+// Ownership is required before we can read any non-atomic fields in the page.
+// This way we can push a block on the thread free list and try to claim ownership atomically in `free.c:mi_free_block_mt`.
typedef uintptr_t mi_thread_free_t; // A page contains blocks of one specific size (`block_size`). @@ -280,226 +365,122 @@ typedef uintptr_t mi_thread_free_t; // The `local_free` and `thread_free` lists are migrated to the `free` list // when it is exhausted. The separate `local_free` list is necessary to // implement a monotonic heartbeat. The `thread_free` list is needed for -// avoiding atomic operations in the common case. +// avoiding atomic operations when allocating from the owning thread. // // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // -// We don't count `freed` (as |free|) but use `used` to reduce +// We don't count "freed" (as |free|) but use only the `used` field to reduce // the number of memory accesses in the `mi_page_all_free` function(s). +// Use `_mi_page_free_collect` to collect the thread_free list and update the `used` count. // // Notes: -// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` +// - Non-atomic fields can only be accessed if having _ownership_ (low bit of `xthread_free` is 1). +// Combining the `thread_free` list with an ownership bit allows a concurrent `free` to atomically +// free an object and (re)claim ownership if the page was abandoned. +// - If a page is not part of a theap it is called "abandoned" (`theap==NULL`) -- in +// that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that +// are in the `pages_abandoned` lists of an arena, these are called "mapped" abandoned pages). +// - page flags are in the bottom 3 bits of `xthread_id` for the fast path in `mi_free`. 
+// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 12 words on 64-bit which helps the page index calculations -// (and 14 words on 32-bit, and encoded free lists add 2 words) -// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize -// concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). -// The invariant is that no-delayed-free is only set if there is -// at least one block that will be added, or as already been added, to -// the owning heap `thread_delayed_free` list. This guarantees that pages -// will be freed correctly even if only other threads free blocks. -typedef struct mi_page_s { - // "owned" by the segment - uint32_t slice_count; // slices in this page (0 if not a page) - uint32_t slice_offset; // distance from the actual page data slice (0 if a page) - uint8_t is_committed:1; // `true` if the page virtual memory is committed - uint8_t is_zero_init:1; // `true` if the page was initially zero initialized - uint8_t is_huge:1; // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`) - // padding - // layout like this to optimize access in `mi_malloc` and `mi_free` - uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` - uint16_t reserved; // number of blocks reserved in memory - mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) - uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized - uint8_t retire_expire:7; // expiration count for retired blocks - - mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t 
block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type - // padding - size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the page area containing the blocks - #if (MI_ENCODE_FREELIST || MI_PADDING) - uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary - #endif - - _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(uintptr_t) xheap; +typedef struct mi_page_s { + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= `theap->thread_id (or 0 or 4 if abandoned) | page_flags`) - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint16_t capacity; // number of blocks committed + uint16_t reserved; // number of blocks reserved in memory + uint8_t retire_expire; // expiration count for retired blocks + bool free_is_zero; // `true` if the blocks in the free list are zero initialized - // 64-bit 11 words, 32-bit 13 words, (+2 for secure) - void* padding[1]; -} mi_page_t; + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads (= `mi_block_t* | (1 if owned)`) + size_t block_size; // const: size available in each block (always `>0`) + uint8_t* page_start; // const: start of the blocks + #if (MI_ENCODE_FREELIST || MI_PADDING) + uintptr_t keys[2]; // const: two random keys to encode the free lists (see `_mi_block_next`) 
or padding canary + #endif -// ------------------------------------------------------ -// Mimalloc segments contain mimalloc pages -// ------------------------------------------------------ + mi_theap_t* theap; // the theap owning this page (may not be valid or NULL for abandoned pages) + mi_heap_t* heap; // const: the heap owning this page -typedef enum mi_page_kind_e { - MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment - MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment - MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // a huge page is a single page in a segment of variable size - // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. -} mi_page_kind_t; + struct mi_page_s* next; // next page owned by the theap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the theap with the same `block_size` + size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already) + mi_memid_t memid; // const: provenance of the page memory +} mi_page_t; -typedef enum mi_segment_kind_e { - MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside. - MI_SEGMENT_HUGE, // segment with just one huge page inside. -} mi_segment_kind_t; // ------------------------------------------------------ -// A segment holds a commit mask where a bit is set if -// the corresponding MI_COMMIT_SIZE area is committed. -// The MI_COMMIT_SIZE must be a multiple of the slice -// size. If it is equal we have the most fine grained -// decommit (but setting it higher can be more efficient). 
-// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will -// be committed in one go which can be set higher than -// MI_COMMIT_SIZE for efficiency (while the decommit mask -// is still tracked in fine-grained MI_COMMIT_SIZE chunks) +// Object sizes // ------------------------------------------------------ -#define MI_MINIMAL_COMMIT_SIZE (1*MI_SEGMENT_SLICE_SIZE) -#define MI_COMMIT_SIZE (MI_SEGMENT_SLICE_SIZE) // 64KiB -#define MI_COMMIT_MASK_BITS (MI_SEGMENT_SIZE / MI_COMMIT_SIZE) -#define MI_COMMIT_MASK_FIELD_BITS MI_SIZE_BITS -#define MI_COMMIT_MASK_FIELD_COUNT (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS) - -#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS)) -#error "the segment size must be exactly divisible by the (commit size * size_t bits)" +#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. +#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 (4*MI_KiB) // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment) +#define MI_PAGE_OSPAGE_BLOCK_ALIGN2 (4*MI_KiB) // also aligns any multiple of this size to avoid TLB misses. +#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation + +// The max object sizes are intended to not waste more than ~ 12.5% internally over the page sizes. 
+#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_OSPAGE_BLOCK_ALIGN2)/6) // = 10 KiB +#if MI_ENABLE_LARGE_PAGES +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_OSPAGE_BLOCK_ALIGN2)/6) // ~ 84 KiB +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#else +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #endif +#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) -typedef struct mi_commit_mask_s { - size_t mask[MI_COMMIT_MASK_FIELD_COUNT]; -} mi_commit_mask_t; - -typedef mi_page_t mi_slice_t; -typedef int64_t mi_msecs_t; - - -// --------------------------------------------------------------- -// a memory id tracks the provenance of arena/OS allocated memory -// --------------------------------------------------------------- - -// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. -typedef enum mi_memkind_e { - MI_MEM_NONE, // not allocated - MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) - MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) - MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) - MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) -} mi_memkind_t; - -static inline bool mi_memkind_is_os(mi_memkind_t memkind) { - return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); -} - -typedef struct mi_memid_os_info { - void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation -} mi_memid_os_info_t; +#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360) +#error "mimalloc internal: define more bins" +#endif -typedef struct mi_memid_arena_info { - size_t block_index; // index in the arena - mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // this arena can only be used for specific arena allocations -} mi_memid_arena_info_t; -typedef struct mi_memid_s { - union { - mi_memid_os_info_t os; // only used for MI_MEM_OS - mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA - } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) - bool initially_committed;// `true` if the memory was originally allocated as committed - bool initially_zero; // `true` if the memory was originally zero initialized - mi_memkind_t memkind; -} mi_memid_t; +// ------------------------------------------------------ +// Page kinds +// ------------------------------------------------------ +typedef enum mi_page_kind_e { + MI_PAGE_SMALL, // small blocks go into 64KiB pages + MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages + MI_PAGE_LARGE, // larger blocks go into 4MiB pages (if `MI_ENABLE_LARGE_PAGES==1`) + MI_PAGE_SINGLETON // page containing a single block. + // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an aligment `> MI_PAGE_MAX_OVERALLOC_ALIGN`. +} mi_page_kind_t; -// ----------------------------------------------------------------------------------------- -// Segments are large allocated memory blocks (8mb on 64 bit) from arenas or the OS. 
-// -// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks. -// The start of a segment is this structure with a fixed number of slice entries (`slices`) -// usually followed by a guard OS page and the actual allocation area with pages. -// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`). -// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent -// slices part of the area, the `slice_offset` is the byte offset back to the first slice -// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`). -// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). -// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while -// large and huge pages span a variable amount of slices. -typedef struct mi_segment_s { - // constant fields - mi_memid_t memid; // memory id for arena/OS allocation - bool allow_decommit; // can we decommmit the memory - bool allow_purge; // can we purge the memory (reset or decommit) - size_t segment_size; - - // segment fields - mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time - mi_commit_mask_t purge_mask; // slices that can be purged - mi_commit_mask_t commit_mask; // slices that are currently committed - - // from here is zero initialized - struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) - bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) - - size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long) - size_t used; // count of pages in use - uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` - - size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` - size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. - - // layout like this to optimize access in `mi_free` - mi_segment_kind_t kind; - size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` - _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - - mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment -} mi_segment_t; // ------------------------------------------------------ -// Heaps -// Provide first-class heaps to allocate from. -// A heap just owns a set of pages for allocation and +// A "theap" is a thread local heap which owns pages. +// (making them thread-local avoids atomic operations) +// +// All theaps belong to a (non-thread-local) heap. +// A theap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. -// Per thread, the segments are shared among its heaps. -// Per thread, there is always a default heap that is -// used for allocation; it is initialized to statically -// point to an empty heap to avoid initialization checks -// in the fast path. +// +// Per thread, there is always a default theap that belongs +// to the default heap. It is initialized to statically +// point initially to an empty theap to avoid initialization +// checks in the fast path. 
// ------------------------------------------------------ // Thread local data -typedef struct mi_tld_s mi_tld_t; +typedef struct mi_tld_s mi_tld_t; // defined below // Pages of a certain block size are held in a queue. typedef struct mi_page_queue_s { mi_page_t* first; mi_page_t* last; + size_t count; size_t block_size; } mi_page_queue_t; -#define MI_BIN_FULL (MI_BIN_HUGE+1) - // Random context typedef struct mi_random_cxt_s { uint32_t input[16]; @@ -510,7 +491,7 @@ typedef struct mi_random_cxt_s { // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows -#if (MI_PADDING) +#if MI_PADDING typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) @@ -525,181 +506,233 @@ typedef struct mi_padding_s { #define MI_PAGES_DIRECT (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1) -// A heap owns a set of pages. -struct mi_heap_s { - mi_tld_t* tld; - _Atomic(mi_block_t*) thread_delayed_free; - mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) +// A thread-local heap ("theap") owns a set of thread-local pages. +struct mi_theap_s { + mi_tld_t* tld; // thread-local data + _Atomic(mi_heap_t*) heap; // the heap this theap belongs to. + _Atomic(size_t) refcount; // reference count + unsigned long long heartbeat; // monotonic heartbeat count uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) - uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. 
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. - mi_heap_t* next; // list of heaps per thread - bool no_reclaim; // `true` if this heap should not reclaim abandoned pages - uint8_t tag; // custom tag, can be used for separating heaps based on the object types + size_t pages_full_size; // optimization: total size of blocks in the pages of the full queue (issue #1220) + long generic_count; // how often is `_mi_malloc_generic` called? + long generic_collect_count; // how often is `_mi_malloc_generic` called without collecting? + + mi_theap_t* tnext; // list of theaps in this thread + mi_theap_t* tprev; + mi_theap_t* hnext; // list of theaps of the owning `heap` + mi_theap_t* hprev; + + long page_full_retain; // how many full pages can be retained per queue (before abandoning them) + bool allow_page_reclaim; // `true` if this theap should not reclaim abandoned pages + bool allow_page_abandon; // `true` if this theap can abandon pages to reduce memory footprint + #if MI_GUARDED + size_t guarded_size_min; // minimal size for guarded objects + size_t guarded_size_max; // maximal size for guarded objects + size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages) + size_t guarded_sample_count; // current sample count (counting down to 0) + #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. - mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin") + mi_memid_t memid; // provenance of the theap struct itself (meta or os) + mi_stats_t stats; // thread-local statistics }; + // ------------------------------------------------------ -// Debug +// Heaps contain allocated blocks. 
Heaps are self-contained +// but share the (sub-process) memory in the arena's. // ------------------------------------------------------ -#if !defined(MI_DEBUG_UNINIT) -#define MI_DEBUG_UNINIT (0xD0) -#endif -#if !defined(MI_DEBUG_FREED) -#define MI_DEBUG_FREED (0xDF) -#endif -#if !defined(MI_DEBUG_PADDING) -#define MI_DEBUG_PADDING (0xDE) -#endif +// Keep track of all owned and abandoned pages in the arena's +struct mi_arena_pages_s; +typedef struct mi_arena_pages_s mi_arena_pages_t; -#if (MI_DEBUG) -// use our own assertion to print without memory allocation -void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); -#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) -#else -#define mi_assert(x) -#endif +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory -#if (MI_DEBUG>1) -#define mi_assert_internal mi_assert -#else -#define mi_assert_internal(x) -#endif +// A dynamic thread-local variable; 0 for an invalid thread-local +typedef size_t mi_thread_local_t; -#if (MI_DEBUG>2) -#define mi_assert_expensive mi_assert -#else -#define mi_assert_expensive(x) -#endif +typedef struct mi_heap_s { + mi_subproc_t* subproc; // a heap belongs to a subprocess + size_t heap_seq; // unique sequence number for heaps in this subprocess + mi_heap_t* next; // list of heaps in this subprocess + mi_heap_t* prev; + mi_thread_local_t theap; // dynamic thread local for the thread-local theaps of this heap -// ------------------------------------------------------ -// Statistics -// ------------------------------------------------------ + mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) + int numa_node; // if >=0, prefer this numa node for allocations -#ifndef MI_STAT -#if (MI_DEBUG>0) -#define MI_STAT 2 -#else -#define MI_STAT 0 -#endif 
-#endif + mi_theap_t* theaps; // list of all thread-local theaps belonging to this heap (using the `hnext`/`hprev` fields) + mi_lock_t theaps_lock; // lock for the theaps list operations -typedef struct mi_stat_count_s { - int64_t allocated; - int64_t freed; - int64_t peak; - int64_t current; -} mi_stat_count_t; - -typedef struct mi_stat_counter_s { - int64_t total; - int64_t count; -} mi_stat_counter_t; - -typedef struct mi_stats_s { - mi_stat_count_t segments; - mi_stat_count_t pages; - mi_stat_count_t reserved; - mi_stat_count_t committed; - mi_stat_count_t reset; - mi_stat_count_t purged; - mi_stat_count_t page_committed; - mi_stat_count_t segments_abandoned; - mi_stat_count_t pages_abandoned; - mi_stat_count_t threads; - mi_stat_count_t normal; - mi_stat_count_t huge; - mi_stat_count_t large; - mi_stat_count_t malloc; - mi_stat_count_t segments_cache; - mi_stat_counter_t pages_extended; - mi_stat_counter_t mmap_calls; - mi_stat_counter_t commit_calls; - mi_stat_counter_t reset_calls; - mi_stat_counter_t purge_calls; - mi_stat_counter_t page_no_retire; - mi_stat_counter_t searches; - mi_stat_counter_t normal_count; - mi_stat_counter_t huge_count; - mi_stat_counter_t large_count; - mi_stat_counter_t arena_count; - mi_stat_counter_t arena_crossover_count; - mi_stat_counter_t arena_rollback_count; -#if MI_STAT>1 - mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; -#endif -} mi_stats_t; - - -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); - -#if (MI_STAT) -#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) -#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) -#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) -#else -#define mi_stat_increase(stat,amount) (void)0 -#define mi_stat_decrease(stat,amount) (void)0 -#define 
mi_stat_counter_increase(stat,amount) (void)0 -#endif + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages in this heap + mi_page_t* os_abandoned_pages; // list of pages that are OS allocated and not in an arena + mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations) + + _Atomic(mi_arena_pages_t*) arena_pages[MI_MAX_ARENAS]; // track owned and abandoned pages in the arenas (entries can be NULL) + mi_lock_t arena_pages_lock; // lock to update the arena_pages array + + mi_stats_t stats; // statistics for this heap; periodically updated by merging from each theap +} mi_heap_t; + + +// ------------------------------------------------------ +// Sub processes do not reclaim or visit pages from other sub processes. +// These are essentially the static variables of a process, and +// usually there is only one subprocess. This can be used for example +// by CPython to have separate interpreters within one process. +// Each thread can only belong to one subprocess +// (and needs to call `mi_subproc_add_current_thread` before any allocations). 
+// ------------------------------------------------------ -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +struct mi_subproc_s { + size_t subproc_seq; // unique id for sub-processes + mi_subproc_t* next; // list of all sub-processes + mi_subproc_t* prev; + + _Atomic(size_t) arena_count; // current count of arena's + _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process + mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time + mi_decl_align(8) // needed on some 32-bit platforms + _Atomic(int64_t) purge_expire; // expiration is set if any arenas can be purged + + _Atomic(mi_heap_t*) heap_main; // main heap for this sub process + mi_heap_t* heaps; // heaps belonging to this sub-process + mi_lock_t heaps_lock; + + _Atomic(size_t) thread_count; // current threads associated with this sub-process + _Atomic(size_t) thread_total_count; // total created threads associated with this sub-process + _Atomic(size_t) heap_count; // current heaps in this sub-process (== |heaps|) + _Atomic(size_t) heap_total_count; // total created heaps in this sub-process + + mi_memid_t memid; // provenance of this memory block (meta or static) + mi_decl_align(8) // needed on some 32-bit platforms + mi_stats_t stats; // subprocess statistics; updated for arena/OS stats like committed, + // and otherwise merged with heap stats when those are deleted +}; // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ -// A "span" is is an available range of slices. The span queues keep -// track of slice spans of at most the given `slice_count` (but more than the previous size class). 
-typedef struct mi_span_queue_s { - mi_slice_t* first; - mi_slice_t* last; - size_t slice_count; -} mi_span_queue_t; - -#define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT) - -// OS thread local data -typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats -} mi_os_tld_t; - - -// Segments thread local data -typedef struct mi_segments_tld_s { - mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments - size_t count; // current number of segments; - size_t peak_count; // peak number of segments - size_t current_size; // current size of all segments - size_t peak_size; // peak size of all segments - size_t reclaim_count;// number of reclaimed (abandoned) segments - mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os stats -} mi_segments_tld_t; +// Milliseconds as in `int64_t` to avoid overflows +typedef int64_t mi_msecs_t; // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_segments_tld_t segments; // segment tld - mi_os_tld_t os; // os tld - mi_stats_t stats; // statistics + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + int numa_node; // thread preferred numa node + mi_subproc_t* subproc; // sub-process this thread belongs to. + mi_theap_t* theaps; // list of theaps in this thread (so we can abandon all when the thread terminates) + mi_lock_t theaps_lock; // lock as the theaps list is sometimes accessed from another thread (on `mi_heap_free`) + bool recurse; // true if deferred was called; used to prevent infinite recursion. 
+ bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; + +/* ---------------------------------------------------------------------------- + Arenas are fixed area's of OS memory from which we can allocate + large blocks (>= MI_ARENA_MIN_BLOCK_SIZE). + In contrast to the rest of mimalloc, the arenas are shared between + threads and need to be accessed using atomic operations (using atomic `mi_bitmap_t`'s). + + Arenas are also used to for huge OS page (1GiB) reservations or for reserving + OS memory upfront which can be improve performance or is sometimes needed + on embedded devices. We can also employ this with WASI or `sbrk` systems + to reserve large arenas upfront and be able to reuse the memory more effectively. +-----------------------------------------------------------------------------*/ + +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) +#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit) +#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE) + +typedef struct mi_bitmap_s mi_bitmap_t; // atomic bitmap (defined in `src/bitmap.h`) +typedef struct mi_bbitmap_s mi_bbitmap_t; // atomic binned bitmap (defined in `src/bitmap.h`) + +typedef struct mi_arena_pages_s { + mi_bitmap_t* pages; // all registered pages (abandoned and owned) + mi_bitmap_t* pages_abandoned[MI_ARENA_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // followed by the bitmaps (whose siz`es depend on the arena size) +} mi_arena_pages_t; + + +// A memory arena +typedef struct mi_arena_s { + mi_memid_t memid; // provenance of the memory area + mi_subproc_t* subproc; // subprocess this arena belongs to (`this 'element-of' this->subproc->arenas`) + size_t arena_idx; // index in the arenas array + + size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) 
+ size_t info_slices; // initial slices reserved for the arena bitmaps + int numa_node; // associated NUMA node + bool is_exclusive; // only allow allocations if specifically for this arena + mi_decl_align(8) // needed on some 32-bit platforms + _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. + mi_commit_fun_t* commit_fun; // custom commit/decommit memory + void* commit_fun_arg; // user argument for a custom commit function + + size_t total_size; // for (user given) memory more than MI_ARENA_MAX_SIZE, we use N arena's to cover it. The first (parent) has the total size (and the other sub-arena's 0). + mi_arena_t* parent; // if this is a sub arena, this points to the first one in the memory area. + + mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) + mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) + mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* slices_purge; // slices that can be purged + mi_page_t* pages_meta; // pre-allocated `slice_count` page meta info -- only used if `MI_PAGE_META_IS_SEPARATED!=0` + mi_arena_pages_t pages_main; // arena page bitmaps for the main heap are allocated up front as well + + // followed by the bitmaps (whose sizes depend on the arena size) + // note: when adding bitmaps revise `mi_arena_info_slices_needed` +} mi_arena_t; + + + +/* ----------------------------------------------------------- + Error codes passed to `_mi_fatal_error` + All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
+ For portability define undefined error codes using common Unix codes: + +----------------------------------------------------------- */ + +#ifndef EAGAIN // double free +#define EAGAIN (11) +#endif +#ifndef ENOMEM // out of memory +#define ENOMEM (12) +#endif +#ifndef EFAULT // corrupted free-list or meta-data +#define EFAULT (14) +#endif +#ifndef EINVAL // trying to free an invalid pointer +#define EINVAL (22) #endif +#ifndef EOVERFLOW // count*size overflow +#define EOVERFLOW (75) +#endif + +/* ----------------------------------------------------------- + Debug constants +----------------------------------------------------------- */ + +#if !defined(MI_DEBUG_UNINIT) +#define MI_DEBUG_UNINIT (0xD0) +#endif +#if !defined(MI_DEBUG_FREED) +#define MI_DEBUG_FREED (0xDF) +#endif +#if !defined(MI_DEBUG_PADDING) +#define MI_DEBUG_PADDING (0xDE) +#endif + + +#endif // MI_TYPES_H diff --git a/system/lib/mimalloc/readme.md b/system/lib/mimalloc/readme.md index a0296b43c35aa..0a2e0060a7039 100644 --- a/system/lib/mimalloc/readme.md +++ b/system/lib/mimalloc/readme.md @@ -1,7 +1,10 @@ - - -[](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary) +
+ v3: + v2: + v1: +   v3: +
# mimalloc @@ -12,17 +15,19 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.7` (2024-05-21). -Latest v1 tag: `v1.8.7` (2024-05-21). +Latest release : `v3.3.0` (2026-04-15) recommended. +Latest v2 release: `v2.3.0` (2026-04-15) stable. +Latest v1 release: `v1.9.8` (2026-04-15) legacy. mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: ``` > LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` -It also includes a robust way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include: +It also includes a way to dynamically override the default allocator in [Windows](#override_on_windows). +Notable aspects of the design include: -- __small and consistent__: the library is about 8k LOC using simple and +- __small and consistent__: the library is about 10k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for @@ -55,6 +60,7 @@ It also includes a robust way to override the default allocator in [Windows](#ov over our benchmarks. - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately. + New: v3 has true first-class heaps where one can allocate in a heap from any thread. 
- __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation times (_wcat_) (upto OS primitives), bounded space overhead (~0.2% meta-data, with low internal fragmentation), and has no internal points of contention using only atomic operations. @@ -64,72 +70,69 @@ It also includes a robust way to override the default allocator in [Windows](#ov of benchmarks. There is also good huge OS page support for larger server programs. The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API. -You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. +You can read more on the design of mimalloc in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. Enjoy! -### Branches - -* `master`: latest stable release (based on `dev-slice`). -* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. -* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for -`src/segment.c`) - -### Releases +### Versions -Note: the `v2.x` version has a different algorithm for managing internal mimalloc pages (as slices) that tends to use reduce -memory usage - and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance - (see [below](#performance)); please report if you observe any significant performance regression. +There are three maintained versions of mimalloc. These are mostly equal except for how the OS memory is handled. +New development is mostly on v3, while v1 and v2 are maintained with security and bug fixes. -* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. 
Started upstreaming patches - from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. -* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. -* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. - Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size - directly available (and new `block_size_shift` to improve aligned block free-ing). - New approach to collection of abandoned segments: When - a thread terminates the segments it owns are abandoned (containing still live objects) and these can be - reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's - which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in - an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` - gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). - -* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity - by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory - usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. - -* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. - -* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support dynamic overriding on Windows 11. Improved tracing precision - with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS - abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). 
Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. - -* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. - Support arbitrary large alignments (in particular for `std::pmr` pools). - Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). - Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). - Various small bug fixes. - -* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow - detection. Initial - support for attaching heaps to a speficic memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, . +- __v3__: recommended: simplifies the lock-free design of previous versions and improves sharing of + memory between threads. On certain large workloads this version may use + (much) less memory. Also supports true first-class heaps (that can allocate from any thread) + and has more efficient heap-walking (for the CPython GC for example). + (release tags: `v3.x`, development branch `dev3`). +- __v2__: stable mimalloc version. Uses thread-local segments to reduce fragmentation. (release tags: `v2.x`, development branch `dev2` and `main`) +- __v1__: legacy version: initial design of mimalloc (release tags: `v1.9.x`, development branch `dev`). Send PR's against this version if possible. 
-* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation - even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix - warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object - allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes. - -* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on - Windows 11, fix compilation with musl, potentially reduced - committed memory, add `bin/minject` for Windows, - improved wasm support, faster aligned allocation, - various small fixes. +### Releases +* 2026-04-15, `v1.9.8`, `v2.3.0`, `v3.3.0`: initial support for github (binary) releases, + fix visiting of full pages during collection (performance), + fix THP alignment (performance), fix arm64 cross-compilation on Windows, enable guard pages in debug mode, + always use uncommitted areas between arenas (security), enable static overloading of `malloc` etc. on Windows with the + static CRT (by @Noxybot), fix TLS slot leak on Windows (v3), enable clean DLL load/unload with statically linked + mimalloc (v3), fix race in `mi_heap_destroy` (v3), by default put page meta info separate from allocated + objects (v3,security), fix C++ overrides for emscripten. Various bugs found by DeepTest include: + fix offset for `mi_heap_realloc_aligned`, fix `mi_(w)dupenv_s` buffer size, fix potential overflow in size options, + and error codes for `mi_reallocarr(ay)`. +* 2026-02-03, `v3.2.8` (rc3): Fix thread reinitialize issue on macOS. Fix SIMD codegen bug on older + GCC versions. Extend Windows TLS slot limit from 64 to 1088. Report commit statistics more precisely. + Fixes an issue in free-page search in arenas. +* 2026-01-15, `v1.9.7`, `v2.2.7`, `v3.2.7` (rc2): Fix zero initializing blocks that were OS allocated. 
+ For v3 various bug and performance fixes. Fix Debian 32-bit compilation. +* 2026-01-08, `v1.9.6`, `v2.2.6`, `v3.2.6` (rc1): Important bug fixes. Many improvements to v3 including + true first-class heaps where one can allocate in heap from any thread, and track statistics per heap as well. + Added `MIMALLOC_ALLOW_THP` option. This is by default enabled except on Android. When THP is detected on v3, + mimalloc will set the `MIMALLOC_MINIMAL_PURGE_SIZE` to 2MiB to avoid breaking up potential THP huge pages. + v3 uses faster TLS access on Windows, and has improved performance for `mi_calloc` and aligned allocations. + Fixed rare race condition on older v3, fixed potential buffer overflow in debug statistics, add API for returning + allocated sizes on allocation and free. +* 2025-06-09, `v1.9.4`, `v2.2.4`, `v3.1.4` (beta) : Some important bug fixes, including a case where OS memory + was not always fully released. Improved v3 performance, build on XBox, fix build on Android, support interpose + for older macOS versions, use MADV_FREE_REUSABLE on macOS, always check commit success, better support for Windows + fixed TLS offset, etc. +* 2025-03-28, `v1.9.3`, `v2.2.3`, `v3.0.3` (beta) : Various small bug and build fixes, including: + fix arm32 pre v7 builds, fix mingw build, get runtime statistics, improve statistic commit counts, + fix execution on non BMI1 x64 systems. +* 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. + Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. + Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. + Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64 + for fast atomic operations. Add QNX support. +* 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. 
New [guarded](#guarded) build that can place OS + guard pages behind objects to catch buffer overflows as they occur. + Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for + thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; + consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2; + add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance. * [Older release notes](#older-release-notes) Special thanks to: +* Sergiy Kuryata for his contributions on reducing memory commit -- especially on Windows with the Windows thread pool (now implemented in v3). * [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc. * Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding @@ -152,7 +155,7 @@ mimalloc is used in various large scale low-latency services and programs, for e - + # Building @@ -160,13 +163,13 @@ mimalloc is used in various large scale low-latency services and programs, for e ## Windows Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. -The `mimalloc` project builds a static library (in `out/msvc-x64`), while the -`mimalloc-override` project builds a DLL for overriding malloc +The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the +`mimalloc-override-dll` project builds a DLL for overriding malloc in the entire program. -## macOS, Linux, BSD, etc. +## Linux, macOS, BSD, etc. 
-We use [`cmake`](https://cmake.org)1 as the build system: +We use [`cmake`](https://cmake.org) as the build system: ``` > mkdir -p out/release @@ -189,32 +192,58 @@ maintains detailed statistics as: > cmake -DCMAKE_BUILD_TYPE=Debug ../.. > make ``` + This will name the shared library as `libmimalloc-debug.so`. -Finally, you can build a _secure_ version that uses guard pages, encrypted -free lists, etc., as: +Finally, you can build a _secure_ version that uses guard pages, encrypted free lists, etc., as: + ``` > mkdir -p out/secure > cd out/secure > cmake -DMI_SECURE=ON ../.. > make ``` + This will name the shared library as `libmimalloc-secure.so`. -Use `ccmake`2 instead of `cmake` -to see and customize all the available build options. +Use `cmake ../.. -LH` to see all the available build options. -Notes: -1. Install CMake: `sudo apt-get install cmake` -2. Install CCMake: `sudo apt-get install cmake-curses-gui` +The examples use the default compiler. If you like to use another, use: +``` +> CC=clang CXX=clang++ cmake ../.. +``` + +## Cmake with Visual Studio + +You can also use cmake on Windows. Open a Visual Studio 2022 development prompt +and invoke `cmake` with the right [generator](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2017%202022.html) +and architecture, like: + +``` +> cmake ..\.. -G "Visual Studio 17 2022" -A x64 -DMI_OVERRIDE=ON +``` -## Single source +The cmake build type is specified when actually building, for example: + +``` +> cmake --build . --config=Release +``` + +You can also install the [LLVM toolset](https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1) +on Windows to build with the `clang-cl` compiler directly: + +``` +> cmake ../.. -G "Visual Studio 17 2022" -T ClangCl +``` + + +## Single Source You can also directly build the single `src/static.c` file as part of your project without needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path. 
-# Using the library +# Using the Library The preferred usage is including ``, linking with the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example, @@ -226,7 +255,7 @@ mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist with other allocators linked to the same program. If you use `cmake`, you can simply use: ``` -find_package(mimalloc 1.4 REQUIRED) +find_package(mimalloc 1.8 REQUIRED) ``` in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either: ``` @@ -240,7 +269,7 @@ to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the global `new` and `delete` operators. For convenience, mimalloc provides -[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. +[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. 
@@ -251,33 +280,52 @@ and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version): 175451865205073170563711388363 = 374456281610909315237213 * 468551 -heap stats: peak total freed unit -normal 2: 16.4 kb 17.5 mb 17.5 mb 16 b ok -normal 3: 16.3 kb 15.2 mb 15.2 mb 24 b ok -normal 4: 64 b 4.6 kb 4.6 kb 32 b ok -normal 5: 80 b 118.4 kb 118.4 kb 40 b ok -normal 6: 48 b 48 b 48 b 48 b ok -normal 17: 960 b 960 b 960 b 320 b ok - -heap stats: peak total freed unit - normal: 33.9 kb 32.8 mb 32.8 mb 1 b ok - huge: 0 b 0 b 0 b 1 b ok - total: 33.9 kb 32.8 mb 32.8 mb 1 b ok -malloc requested: 32.8 mb - - committed: 58.2 kb 58.2 kb 58.2 kb 1 b ok - reserved: 2.0 mb 2.0 mb 2.0 mb 1 b ok - reset: 0 b 0 b 0 b 1 b ok - segments: 1 1 1 --abandoned: 0 - pages: 6 6 6 --abandoned: 0 - mmaps: 3 - mmap fast: 0 - mmap slow: 1 - threads: 0 - elapsed: 2.022s - process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb +subproc 0 + blocks peak total current block total# + bin S 4: 75.3 KiB 55.2 MiB 0 32 B 1.8 M ok + bin S 6: 31.0 KiB 180.4 KiB 0 48 B 3.8 K ok + bin S 8: 64 B 64 B 0 64 B 1 ok + bin S 9: 160 B 160 B 0 80 B 2 ok + bin S 17: 1.2 KiB 1.2 KiB 0 320 B 4 ok + bin S 21: 640 B 3.1 KiB 0 640 B 5 ok + bin S 33: 5.0 KiB 5.0 KiB 0 5.0 KiB 1 ok + + binned : 84.2 Ki 41.5 Mi 0 ok + huge : 0 0 0 ok + total : 84.2 KiB 41.5 MiB 0 + malloc req: 29.7 MiB + + pages peak total current block total# + touched : 152.8 KiB 152.8 KiB 152.8 KiB + pages : 8 14 0 ok + abandoned : 1 249 0 ok + reclaima : 0 + reclaimf : 249 + reabandon : 0 + waits : 0 + extended : 38 + retire : 35 + searches : 0.7 avg + + arenas peak total current block total# + reserved : 1.0 GiB 1.0 GiB 1.0 GiB + committed : 4.8 MiB 4.8 MiB 4.4 MiB + reset : 0 + purged : 385.5 Ki + arenas : 1 + rollback : 0 + mmaps : 3 + commits : 0 + resets : 1 + purges : 2 + guarded : 0 + heaps : 1 1 1 + + process peak total current block total# + threads : 1 1 1 + numa nodes: 1 + elapsed : 0.553 s + process : user: 0.557 s, system: 
0.013 s, faults: 29, peak rss: 2.1 MiB, peak commit: 4.8 MiB ``` The above model of using the `mi_` prefixed API is not always possible @@ -290,53 +338,51 @@ completely and redirect all calls to the _mimalloc_ library instead . You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables: - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates. -- `MIMALLOC_VERBOSE=1`: show verbose messages. +- `MIMALLOC_VERBOSE=1`: show verbose messages (including statistics). - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages. Advanced options: -- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc - allocates segments and pages. Set this to 2 (default) to - only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems - as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). - Note that eager commit only increases the commit but not the actual the peak resident set +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual the peak resident set (rss) so it is generally ok to enable this. -- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge - OS pages that are not in use. 
This signals to the OS that the underlying physical memory can be reused which +- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `1000` in v3) after which mimalloc will purge + OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when - a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher - value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. + a page becomes unused which can improve memory usage but also decreases performance. Setting it to `-1` disables purging completely. - `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, `MADV_DONTNEED` (which decresease rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). - Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: +- `MIMALLOC_ALLOW_THP=1`: By default always allow transparent huge pages (THP) on Linux systems. On Android only this is + by default off. When set to `0`, THP is disabled for the process that mimalloc runs in. If enabled, mimalloc also sets + the `MIMALLOC_MINIMAL_PURGE_SIZE` in v3 to 2MiB to avoid potentially breaking up transparent huge pages when purging memory. 
- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). -- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly - improve performance. When this option is disabled, it also disables transparent huge pages (THP) for the process - (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=0`: Set to 1 to use large OS pages (2 or 4MiB) when available; for some workloads this can + significantly improve performance. However, large OS pages cannot be purged or shared with other processes so may lead + to increased memory usage in some cases. + Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that - can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). + can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. 
- Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large + Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). - With huge OS pages, it may be beneficial to set the setting - `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) - of a thread to not allocate in the huge OS pages; this prevents threads that are short lived - and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned - to physical memory). The huge pages are usually allocated evenly among NUMA nodes. We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all the huge pages at a specific numa node instead. @@ -367,13 +413,40 @@ As always, evaluate with care as part of an overall security strategy as all of ## Debug Mode -When _mimalloc_ is built using debug mode, various checks are done at runtime to catch development errors. +When _mimalloc_ is built using debug mode (`-DCMAKE_BUILD_TYPE=Debug`), +various checks are done at runtime to catch development errors. - Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime. - All objects have padding at the end to detect (byte precise) heap block overflows. - Double free's, and freeing invalid heap pointers are detected. - Corrupted free-lists and some forms of use-after-free are detected. +## Guarded Mode + +_mimalloc_ can be built in guarded mode using the `-DMI_GUARDED=ON` flag in `cmake`. 
+This is `ON` by default when building a debug version of mimalloc. +Guarded mode enables placing OS guard pages behind certain object allocations to catch buffer overflows as they occur. +This can be invaluable to catch buffer-overflow bugs in large programs. However, it also means that any object +allocated with a guard page takes at least 8 KiB memory for the guard page and its alignment. As such, allocating +a guard page for every allocation may be too expensive both in terms of memory, and in terms of performance with +many system calls. Therefore, there are various environment variables (and options) to tune this: + +- `MIMALLOC_GUARDED_SAMPLE_RATE=N`: Set the sample rate to `N` (by default 0). This mode places a guard page + behind every `N` suitable object allocations (per thread). Since the performance in guarded mode without placing + guard pages is close to release mode, this can be used to enable guard pages even in production to catch latent + buffer overflow bugs. Set the sample rate to `1` to guard every object, and to `0` to place no guard pages at all. + +- `MIMALLOC_GUARDED_SAMPLE_SEED=N`: Start sampling at `N` (by default random). Can be used to reproduce a buffer + overflow if needed. + +- `MIMALLOC_GUARDED_MIN=N`, `MIMALLOC_GUARDED_MAX=N`: Minimal and maximal _rounded_ object sizes for which a guard + page is considered (`0` and `1GiB` respectively). If you suspect a buffer overflow occurs with an object of size + 141, set the minimum and maximum to `148` and the sample rate to `1` to have all of those guarded. + +- `MIMALLOC_GUARDED_PRECISE=1`: If we have an object of size 13, we would usually place it an aligned 16 bytes in + front of the guard page. Using `MIMALLOC_GUARDED_PRECISE` places it exactly 13 bytes before a page so that even + a 1 byte overflow is detected. This violates the C/C++ minimal alignment guarantees though so use with care. 
+ # Overriding Standard Malloc @@ -417,41 +490,44 @@ the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-i ### Dynamic Override on Windows -Dynamically overriding on mimalloc on Windows -is robust and has the particular advantage to be able to redirect all malloc/free calls that go through -the (dynamic) C runtime allocator, including those from other DLL's or libraries. -As it intercepts all allocation calls on a low level, it can be used reliably -on large programs that include other 3rd party components. -There are four requirements to make the overriding work robustly: +We use a separate redirection DLL to override mimalloc on Windows +such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator, +including those from other DLL's or libraries. As it intercepts all allocation calls on a low level, +it can be used on large programs that include other 3rd party components. +There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). -2. Link your program explicitly with `mimalloc-override.dll` library. - To ensure the `mimalloc-override.dll` is loaded at run-time it is easiest to insert some - call to the mimalloc API in the `main` function, like `mi_version()` - (or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project - for an example on how to use this. -3. The [`mimalloc-redirect.dll`](bin) (or `mimalloc-redirect32.dll`) must be put - in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency of that DLL). - The redirection DLL ensures that all calls to the C runtime malloc API get redirected to - mimalloc functions (which reside in `mimalloc-override.dll`). -4. Ensure the `mimalloc-override.dll` comes as early as possible in the import + +2. Link your program explicitly with the `mimalloc.dll.lib` export library for the `mimalloc.dll`. 
+ (which must be compiled with `-DMI_OVERRIDE=ON`, which is the default though). + To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest + to insert some call to the mimalloc API in the `main` function, like `mi_version()` + (or use the `/include:mi_version` switch on the linker command, or + similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). + See the `mimalloc-test-override` project for an example on how to use this. + +3. The `mimalloc-redirect.dll` must be put in the same directory as the main + `mimalloc.dll` at runtime (as it is a dependency of that DLL). + The redirection DLL ensures that all calls to the C runtime malloc API get + redirected to mimalloc functions (which reside in `mimalloc.dll`). + +4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). + You can use `minject -l ` to check this if needed. -For best performance on Windows with C++, it -is also recommended to also override the `new`/`delete` operations (by including -[`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) +For best performance on Windows with C++, it is also recommended to also override +the `new`/`delete` operations (by including [`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic -overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. +overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully +redirected. + +For different platforms than x64, you may need a specific [redirection dll](bin). +Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes +first in the import table. In such cases the [`minject`](bin) tool can be used +to patch the executable's import tables. 
-We cannot always re-link an executable with `mimalloc-override.dll`, and similarly, we cannot always -ensure the the DLL comes first in the import table of the final executable. -In many cases though we can patch existing executables without any recompilation -if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll` -into the import table (and put `mimalloc-redirect.dll` in the same folder) -Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388) or -the [`minject`](bin) program. ## Static override @@ -462,6 +538,7 @@ an object file instead of a library file as linkers give preference to that over archives to resolve symbols. To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: + ``` > gcc -o myprogram mimalloc.o myfile1.c ... ``` @@ -469,16 +546,20 @@ object file. For example: Another way to override statically that works on all platforms, is to link statically to mimalloc (as shown in the introduction) and include a header file in each source file that re-defines `malloc` etc. to `mi_malloc`. -This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are +This is provided by [`mimalloc-override.h`](include/mimalloc-override.h). This only works +reliably though if all sources are under your control or otherwise mixing of pointers from different heaps may occur! +Note: recently we also enabled static overloading on Windows. In that case you need +to link with the static CRT _release_ runtime (`/MT`) and link with the static +`mimalloc(-debug).obj` (to take precedence over the definitions in the CRT library). 
# Tools Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc -can also be build to support the [address sanitizer][asan] or the excellent [Valgrind] tool. +can also be built to support the [address sanitizer][asan] or the excellent [Valgrind] tool. Moreover, it can be built to support Windows event tracing ([ETW]). -This has a small performance overhead but does allow detecting memory leaks and byte-precise +This has a small performance overhead but does allow detecting memory leaks and byte-precise buffer overflows directly on final executables. See also the `test/test-wrong.c` file to test with various tools. ## Valgrind @@ -505,9 +586,13 @@ you also need to tell `valgrind` to not intercept those calls itself, and use: By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed used and not the standard allocator. Even though the [Valgrind option][valgrind-soname] -is called `--soname-synonyms`, this also -works when overriding with a static library or object file. Unfortunately, it is not possible to -dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`. +is called `--soname-synonyms`, this also works when overriding with a static library or object file. +To dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`, use: + +``` +> valgrind --trace-children=yes --soname-synonyms=somalloc=*mimalloc* /usr/bin/env LD_PRELOAD=/usr/lib/libmimalloc.so -- +``` + See also the `test/test-wrong.c` file to test with `valgrind`. Valgrind support is in its initial development -- please report any issues. 
You can then run your programs as:' ``` @@ -531,7 +616,7 @@ You can then run your programs as:' ``` When you link a program with an address sanitizer build of mimalloc, you should -generally compile that program too with the address sanitizer enabled. +generally compile that program too with the address sanitizer enabled. For example, assuming you build mimalloc in `out/debug`: ``` @@ -540,23 +625,23 @@ clang -g -o test-wrong -Iinclude test/test-wrong.c out/debug/libmimalloc-asan-de Since the address sanitizer redirects the standard allocation functions, on some platforms (macOSX for example) it is required to compile mimalloc with `-DMI_OVERRIDE=OFF`. -Adress sanitizer support is in its initial development -- please report any issues. +Address sanitizer support is in its initial development -- please report any issues. [asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer ## ETW Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations though -mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. +mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. -You can then capture an allocation trace using the Windows performance recorder (WPR), using the +You can then capture an allocation trace using the Windows performance recorder (WPR), using the `src/prim/windows/etw-mimalloc.wprp` profile. In an admin prompt, you can use: ``` > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode > > wpr -stop .etl -``` -and then open `.etl` in the Windows Performance Analyzer (WPA), or +``` +and then open `.etl` in the Windows Performance Analyzer (WPA), or use a tool like [TraceControl] that is specialized for analyzing mimalloc traces. [ETW]: https://learn.microsoft.com/en-us/windows-hardware/test/wpt/event-tracing-for-windows @@ -601,7 +686,7 @@ as [mimalloc-bench](https://github.com/daanx/mimalloc-bench). 
## Benchmark Results on a 16-core AMD 5950x (Zen3) Testing on the 16-core AMD 5950x processor at 3.4Ghz (4.9Ghz boost), with -with 32GiB memory at 3600Mhz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0. +32GiB memory at 3600Mhz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0. We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0), the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0). @@ -823,6 +908,50 @@ provided by the bot. You will only need to do this once across all repos using o # Older Release Notes +* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches + from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. +* 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. +* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. + Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size + directly available (and new `block_size_shift` to improve aligned block free-ing). + New approach to collection of abandoned segments: When + a thread terminates the segments it owns are abandoned (containing still live objects) and these can be + reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's + which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in + an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` + gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). + +* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). 
Reduce code size/complexity + by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory + usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. + +* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. + +* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support dynamic overriding on Windows 11. Improved tracing precision + with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS + abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. + +* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. + Support arbitrary large alignments (in particular for `std::pmr` pools). + Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). + Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). + Various small bug fixes. + +* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow + detection. Initial + support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
+ +* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation + even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix + warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object + allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes. + +* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on + Windows 11, fix compilation with musl, potentially reduced + committed memory, add `bin/minject` for Windows, + improved wasm support, faster aligned allocation, + various small fixes. + * 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including M1), improved performance for v2 for large objects, Python integration improvements, more standard installation directories, various small fixes. diff --git a/system/lib/mimalloc/src/alloc-aligned.c b/system/lib/mimalloc/src/alloc-aligned.c index ba629ef30a4c2..371e24de473e0 100644 --- a/system/lib/mimalloc/src/alloc-aligned.c +++ b/system/lib/mimalloc/src/alloc-aligned.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/prim.h" // mi_prim_get_default_heap +#include "mimalloc/prim.h" // _mi_theap_default #include // memset @@ -16,42 +16,78 @@ terms of the MIT license. 
A copy of the license can be found in the file // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). + // certain blocks are always allocated at a certain natural alignment. + // (see also `arena.c:mi_arenas_page_alloc_fresh`). mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; - if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); - return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); + const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize)) || // power-of-two under N + (alignment==MI_PAGE_OSPAGE_BLOCK_ALIGN2 && (bsize % MI_PAGE_OSPAGE_BLOCK_ALIGN2)==0); // or multiple of N + if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size + return ok; } +#if MI_GUARDED +static mi_decl_restrict void* mi_theap_malloc_guarded_aligned(mi_theap_t* theap, size_t size, size_t alignment, bool zero) mi_attr_noexcept { + // use over allocation for guarded blocks + #if MI_THEAP_INITASNULL + if mi_unlikely(theap==NULL) { theap = _mi_theap_empty_get(); } + #endif + mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t oversize = size + alignment - 1; + void* base = _mi_theap_malloc_guarded(theap, oversize, zero); + void* p = _mi_align_up_ptr(base, alignment); + mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); + mi_assert_internal(mi_usable_size(p) >= size); + mi_assert_internal(_mi_is_aligned(p, alignment)); + return p; +} + +static void* mi_theap_malloc_zero_no_guarded(mi_theap_t* theap, size_t size, bool zero, size_t* usable) { + #if MI_THEAP_INITASNULL + if mi_unlikely(theap==NULL) { theap = _mi_theap_empty_get(); } + #endif + const
size_t rate = theap->guarded_sample_rate; + // only write if `rate!=0` so we don't write to the constant `_mi_theap_empty` + if (rate != 0) { theap->guarded_sample_rate = 0; } + void* p = _mi_theap_malloc_zero(theap, size, zero, usable); + if (rate != 0) { theap->guarded_sample_rate = rate; } + return p; +} +#else +static void* mi_theap_malloc_zero_no_guarded(mi_theap_t* theap, size_t size, bool zero, size_t* usable) { + return _mi_theap_malloc_zero(theap, size, zero, usable); +} +#endif + // Fallback aligned allocation that over-allocates -- split out for better codegen -static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +static mi_decl_noinline void* mi_theap_malloc_zero_aligned_at_overalloc(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept { mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); void* p; size_t oversize; - if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { - // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) - // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the - // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) + if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // use OS allocation for large alignments and allocate inside a singleton page (not in an arena) + // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned + // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map). 
if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet #if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); + _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); #endif return NULL; } oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); - p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block - // zero afterwards as only the area from the aligned_p may be committed! + // note: no guarded as alignment > 0 + p = _mi_theap_malloc_zero_ex(theap, oversize, zero, alignment, usable); // the page block size should be large enough to align in the single huge page block if (p == NULL) return NULL; } else { // otherwise over-allocate - oversize = size + alignment - 1; - p = _mi_heap_malloc_zero(heap, oversize, zero); + oversize = (size < MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : size) + alignment - 1; // adjust for size <= 16; with size 0 and alignment 64k, we would allocate a 64k block and pointing just beyond that. + p = mi_theap_malloc_zero_no_guarded(theap, oversize, zero, usable); if (p == NULL) return NULL; } @@ -61,69 +97,89 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t const uintptr_t adjust = (poffset == 0 ? 0 : alignment - poffset); mi_assert_internal(adjust < alignment); void* aligned_p = (void*)((uintptr_t)p + adjust); + + // note: after the above allocation, the page may be abandoned now (as it became full, see `page.c:_mi_malloc_generic`) + // and we no longer own it.
We should be careful to only read constant fields in the page, + // or use safe atomic access as in `mi_page_set_has_interior_pointers`. + // (we can access the page though since the just allocated pointer keeps it alive) + mi_page_t* page = _mi_ptr_page(p); if (aligned_p != p) { - mi_page_t* page = _mi_ptr_page(p); - mi_page_set_has_aligned(page, true); + mi_page_set_has_interior_pointers(page, true); + #if MI_GUARDED + // set tag to aligned so mi_usable_size works with guard pages + if (adjust >= sizeof(mi_block_t)) { + mi_block_t* const block = (mi_block_t*)p; + block->next = MI_BLOCK_TAG_ALIGNED; + } + #endif _mi_padding_shrink(page, (mi_block_t*)p, adjust + size); } // todo: expand padding if overallocated ? - mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); - mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(mi_page_usable_block_size(page) >= adjust + size); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); + #if MI_DEBUG > 1 + mi_page_t* const apage = _mi_ptr_page(aligned_p); + void* unalign_p = _mi_page_ptr_unalign(apage, aligned_p); + mi_assert_internal(p == unalign_p); + #endif // now zero the block if needed - if (alignment > MI_BLOCK_ALIGNMENT_MAX) { - // for the tracker, on huge aligned allocations only the memory from the start of the large block is defined - mi_track_mem_undefined(aligned_p, size); - if (zero) { - _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); - } - } + //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // // for the tracker, on huge aligned allocations only from the start of the large block is defined + // mi_track_mem_undefined(aligned_p, size); + // if (zero) { + // _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); + // } + //} if (p != aligned_p) { 
mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); + #if MI_GUARDED + mi_track_mem_defined(p, sizeof(mi_block_t)); + #endif } return aligned_p; } // Generic primitive aligned allocation -- split out for better codegen -static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +static mi_decl_noinline void* mi_theap_malloc_zero_aligned_at_generic(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept { mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) - if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { + if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } - + // use regular allocation if it is guaranteed to fit the alignment constraints. - // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist + // this is important to try as the fast path in `mi_theap_malloc_zero_aligned` only works when there exist // a page with the right block size, and if we always use the over-alloc fallback that would never happen. if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { - void* p = _mi_heap_malloc_zero(heap, size, zero); + void* p = mi_theap_malloc_zero_no_guarded(theap, size, zero, usable); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; if mi_likely(is_aligned_or_null) { return p; } else { // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. 
mi_assert(false); - mi_free(p); + mi_free(p); } } // fall back to over-allocation - return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); + return mi_theap_malloc_zero_aligned_at_overalloc(theap,size,alignment,offset,zero,usable); } + // Primitive aligned allocation -static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +static inline void* mi_theap_malloc_zero_aligned_at(mi_theap_t* const theap, const size_t size, const size_t alignment, const size_t offset, const bool zero, size_t* usable) mi_attr_noexcept { // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) @@ -132,94 +188,137 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t #endif return NULL; } - + + #if MI_GUARDED + #if MI_THEAP_INITASNULL + if mi_likely(theap!=NULL) + #endif + if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_theap_malloc_use_guarded(theap,size)) { + return mi_theap_malloc_guarded_aligned(theap, size, alignment, zero); + } + #endif + // try first if there happens to be a small block available with just the right alignment - if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { - const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; - mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); - if mi_likely(page->free != NULL) { - const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; - if mi_likely(is_aligned) - { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif - void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen - mi_assert_internal(p != NULL); - mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); - mi_track_malloc(p,size,zero); - return p; + // since most small power-of-2 blocks (under MI_PAGE_MAX_BLOCK_START_ALIGN2) are already + // naturally aligned this can be often the case. + #if MI_THEAP_INITASNULL + if mi_likely(theap!=NULL) + #endif + { + if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { + const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` + const size_t padsize = size + MI_PADDING_SIZE; + mi_page_t* page = _mi_theap_get_free_small_page(theap, padsize); + if mi_likely(page->free != NULL) { + const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; + if mi_likely(is_aligned) + { + if (usable!=NULL) { *usable = mi_page_usable_block_size(page); } + void* p = _mi_page_malloc_zero(theap, page, padsize, zero); + mi_assert_internal(p != NULL); + mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); + mi_track_malloc(p, size, zero); + return p; + } } } } // fallback to generic aligned allocation - return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero); + return mi_theap_malloc_zero_aligned_at_generic(theap, size, alignment, offset, zero, usable); } // ------------------------------------------------------ -// Optimized mi_heap_malloc_aligned / mi_malloc_aligned +// Internal mi_theap_malloc_aligned / mi_malloc_aligned // ------------------------------------------------------ -mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false); +static mi_decl_restrict void* mi_theap_malloc_aligned_at(mi_theap_t* theap, size_t size, size_t alignment, size_t offset) 
mi_attr_noexcept { + return mi_theap_malloc_zero_aligned_at(theap, size, alignment, offset, false, NULL); } -mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_malloc_aligned_at(heap, size, alignment, 0); +mi_decl_nodiscard mi_decl_restrict void* mi_theap_malloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_malloc_aligned_at(theap, size, alignment, 0); } -// ensure a definition is emitted -#if defined(__cplusplus) -void* _mi_extern_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned; -#endif - -// ------------------------------------------------------ -// Aligned Allocation -// ------------------------------------------------------ - -mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true); +static mi_decl_restrict void* mi_theap_zalloc_aligned_at(mi_theap_t* theap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_malloc_zero_aligned_at(theap, size, alignment, offset, true, NULL); } -mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_zalloc_aligned_at(heap, size, alignment, 0); +static mi_decl_restrict void* mi_theap_zalloc_aligned(mi_theap_t* theap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_zalloc_aligned_at(theap, size, alignment, 0); } -mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +static mi_decl_restrict void* mi_theap_calloc_aligned_at(mi_theap_t* theap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, 
size, &total)) return NULL; - return mi_heap_zalloc_aligned_at(heap, total, alignment, offset); + return mi_theap_zalloc_aligned_at(theap, total, alignment, offset); } -mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_calloc_aligned_at(heap,count,size,alignment,0); +static mi_decl_restrict void* mi_theap_calloc_aligned(mi_theap_t* theap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_calloc_aligned_at(theap, count, size, alignment, 0); } + +// ------------------------------------------------------ +// Aligned Allocation +// ------------------------------------------------------ + mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_malloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); + return mi_theap_malloc_aligned_at(_mi_theap_default(), size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_malloc_aligned(mi_prim_get_default_heap(), size, alignment); + return mi_theap_malloc_aligned(_mi_theap_default(), size, alignment); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_umalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept { + return mi_theap_malloc_zero_aligned_at(_mi_theap_default(), size, alignment, 0, false, block_size); } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_zalloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); + return mi_theap_zalloc_aligned_at(_mi_theap_default(), size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return 
mi_heap_zalloc_aligned(mi_prim_get_default_heap(), size, alignment); + return mi_theap_zalloc_aligned(_mi_theap_default(), size, alignment); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_uzalloc_aligned(size_t size, size_t alignment, size_t* block_size) mi_attr_noexcept { + return mi_theap_malloc_zero_aligned_at(_mi_theap_default(), size, alignment, 0, true, block_size); } mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_calloc_aligned_at(mi_prim_get_default_heap(), count, size, alignment, offset); + return mi_theap_calloc_aligned_at(_mi_theap_default(), count, size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_calloc_aligned(mi_prim_get_default_heap(), count, size, alignment); + return mi_theap_calloc_aligned(_mi_theap_default(), count, size, alignment); +} + + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_malloc_aligned_at(_mi_heap_theap(heap), size, alignment, offset); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_malloc_aligned(_mi_heap_theap(heap), size, alignment); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_zalloc_aligned_at(_mi_heap_theap(heap), size, alignment, offset); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_zalloc_aligned(_mi_heap_theap(heap), size, alignment); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, 
size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_calloc_aligned_at(_mi_heap_theap(heap), count, size, alignment, offset); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_calloc_aligned(_mi_heap_theap(heap), count, size, alignment); } @@ -227,18 +326,18 @@ mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t // Aligned re-allocation // ------------------------------------------------------ -static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept { +static void* mi_theap_realloc_zero_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept { mi_assert(alignment > 0); - if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero); - if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero); + if (alignment <= sizeof(uintptr_t)) return _mi_theap_realloc_zero(theap,p,newsize,zero,NULL,NULL); + if (p == NULL) return mi_theap_malloc_zero_aligned_at(theap,newsize,alignment,offset,zero,NULL); size_t size = mi_usable_size(p); if (newsize <= size && newsize >= (size - (size / 2)) && (((uintptr_t)p + offset) % alignment) == 0) { - return p; // reallocation still fits, is aligned and not more than 50% waste + return p; // reallocation still fits, is aligned and not more than 50% waste } else { // note: we don't zero allocate upfront so we only zero initialize the expanded part - void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset); + void* newp = mi_theap_malloc_aligned_at(theap,newsize,alignment,offset); if (newp != NULL) { if (zero && newsize > size) { // also set last word in the previous allocation to zero to ensure any padding is zero-initialized @@ -252,61 +351,88 @@ static
void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne } } -static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept { +static void* mi_theap_realloc_zero_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept { mi_assert(alignment > 0); - if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero); - size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL) - return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero); + if (alignment <= sizeof(uintptr_t)) return _mi_theap_realloc_zero(theap,p,newsize,zero,NULL,NULL); + return mi_theap_realloc_zero_aligned_at(theap,p,newsize,alignment,0,zero); } -mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false); +static void* mi_theap_realloc_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_realloc_zero_aligned_at(theap,p,newsize,alignment,offset,false); } -mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false); +static void* mi_theap_realloc_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_theap_realloc_zero_aligned(theap,p,newsize,alignment,false); } -mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true); +static void* mi_theap_rezalloc_aligned_at(mi_theap_t* theap, void* p, size_t newsize, size_t 
alignment, size_t offset) mi_attr_noexcept { + return mi_theap_realloc_zero_aligned_at(theap, p, newsize, alignment, offset, true); } -mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true); +static void* mi_theap_rezalloc_aligned(mi_theap_t* theap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_theap_realloc_zero_aligned(theap, p, newsize, alignment, true); } -mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { +static void* mi_theap_recalloc_aligned_at(mi_theap_t* theap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; - return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset); + return mi_theap_rezalloc_aligned_at(theap, p, total, alignment, offset); } -mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { +static void* mi_theap_recalloc_aligned(mi_theap_t* theap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(newcount, size, &total)) return NULL; - return mi_heap_rezalloc_aligned(heap, p, total, alignment); + return mi_theap_rezalloc_aligned(theap, p, total, alignment); } + mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_realloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); + return mi_theap_realloc_aligned_at(_mi_theap_default(), p, newsize, alignment, offset); } mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return 
mi_heap_realloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); + return mi_theap_realloc_aligned(_mi_theap_default(), p, newsize, alignment); } mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_rezalloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); + return mi_theap_rezalloc_aligned_at(_mi_theap_default(), p, newsize, alignment, offset); } mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_rezalloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); + return mi_theap_rezalloc_aligned(_mi_theap_default(), p, newsize, alignment); } mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_recalloc_aligned_at(mi_prim_get_default_heap(), p, newcount, size, alignment, offset); + return mi_theap_recalloc_aligned_at(_mi_theap_default(), p, newcount, size, alignment, offset); } mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment); + return mi_theap_recalloc_aligned(_mi_theap_default(), p, newcount, size, alignment); } + + +mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_realloc_aligned_at(_mi_heap_theap(heap), p, newsize, alignment, offset); +} + +mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_theap_realloc_aligned(_mi_heap_theap(heap), p, newsize, alignment); +} + +mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + 
return mi_theap_rezalloc_aligned_at(_mi_heap_theap(heap), p, newsize, alignment, offset); +} + +mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_theap_rezalloc_aligned(_mi_heap_theap(heap), p, newsize, alignment); +} + +mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_theap_recalloc_aligned_at(_mi_heap_theap(heap), p, newcount, size, alignment, offset); +} + +mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { + return mi_theap_recalloc_aligned(_mi_heap_theap(heap), p, newcount, size, alignment); +} + + diff --git a/system/lib/mimalloc/src/alloc-override.c b/system/lib/mimalloc/src/alloc-override.c index ded7a101de9f2..4737fbc377161 100644 --- a/system/lib/mimalloc/src/alloc-override.c +++ b/system/lib/mimalloc/src/alloc-override.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2026, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -9,11 +9,8 @@ terms of the MIT license. 
A copy of the license can be found in the file #error "this file should be included from 'alloc.c' (so aliases can work)" #endif -#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL)) -#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" -#endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) +#if defined(MI_MALLOC_OVERRIDE) && !defined(_DLL) #if defined(__APPLE__) #include @@ -72,24 +69,20 @@ typedef void* mi_nothrow_t; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) - __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = + #define MI_INTERPOSE_DECLS(name) __attribute__((used)) static struct mi_interpose_s name[] __attribute__((section("__DATA, __interpose"))) + + MI_INTERPOSE_DECLS(_mi_interposes) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), - #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 - MI_INTERPOSE_MI(strndup), - #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked), MI_INTERPOSE_MI(malloc_good_size), - #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 - MI_INTERPOSE_MI(aligned_alloc), - #endif #ifdef MI_OSX_ZONE // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely MI_INTERPOSE_MI(free), @@ -100,6 +93,12 @@ typedef void* mi_nothrow_t; MI_INTERPOSE_FUN(vfree,mi_cfree), #endif }; + MI_INTERPOSE_DECLS(_mi_interposes_10_7) __OSX_AVAILABLE(10.7) = { + MI_INTERPOSE_MI(strndup), + }; + MI_INTERPOSE_DECLS(_mi_interposes_10_15) __OSX_AVAILABLE(10.15) = { + MI_INTERPOSE_MI(aligned_alloc), + }; 
#ifdef __cplusplus extern "C" { @@ -128,21 +127,103 @@ typedef void* mi_nothrow_t; }; #elif defined(_MSC_VER) - // cannot override malloc unless using a dll. - // we just override new/delete which does work in a static library. + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRT_HYBRIDPATCHABLE + void* __cdecl _expand(_Pre_notnull_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Size) { + return mi_expand(_Block, _Size); + } + _Check_return_ _ACRTIMP + size_t __cdecl _msize_base(_Pre_notnull_ void* _Block) _CRT_NOEXCEPT { + return mi_malloc_size(_Block); + } + _Check_return_ _ACRTIMP _CRT_HYBRIDPATCHABLE + size_t __cdecl _msize(_Pre_notnull_ void* _Block) { + return mi_malloc_size(_Block); + } + _ACRTIMP + void __cdecl _free_base(_Pre_maybenull_ _Post_invalid_ void* _Block) { + mi_free(_Block); + } + _ACRTIMP _CRT_HYBRIDPATCHABLE + void __cdecl free(_Pre_maybenull_ _Post_invalid_ void* _Block) { + mi_free(_Block); + } + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRT_JIT_INTRINSIC _CRTRESTRICT _CRT_HYBRIDPATCHABLE + void* __cdecl malloc(_In_ _CRT_GUARDOVERFLOW size_t _Size) { + return mi_malloc(_Size); + } + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _malloc_base(_In_ size_t _Size) { + return mi_malloc(_Size); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _realloc_base(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ size_t _Size) { + return mi_realloc(_Block, _Size); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT _CRT_HYBRIDPATCHABLE + void* __cdecl realloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Size) { + return mi_realloc(_Block, _Size); + } + _Check_return_ _Ret_maybenull_ 
_Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _calloc_base(_In_ size_t _Count, _In_ size_t _Size) { + return mi_calloc(_Count, _Size); + } + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRT_JIT_INTRINSIC _CRTALLOCATOR _CRTRESTRICT + void* __cdecl calloc(_In_ _CRT_GUARDOVERFLOW size_t _Count, _In_ _CRT_GUARDOVERFLOW size_t _Size) { + return mi_calloc(_Count, _Size); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _recalloc_base(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ size_t _Count, _In_ size_t _Size) { + return mi_recalloc(_Block, _Count, _Size); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _recalloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Count, _In_ _CRT_GUARDOVERFLOW size_t _Size) { + return mi_recalloc(_Block, _Count, _Size); + } + _ACRTIMP + void __cdecl _aligned_free(_Pre_maybenull_ _Post_invalid_ void* _Block) { + mi_free(_Block); + } + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_malloc(_In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment) { + return mi_malloc_aligned(_Size, _Alignment); + } + _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_offset_malloc(_In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment, _In_ size_t _Offset) { + return mi_malloc_aligned_at(_Size, _Alignment, _Offset); + } + _Check_return_ _ACRTIMP + size_t __cdecl _aligned_msize(_Pre_notnull_ void* _Block, _In_ size_t _Alignment, _In_ size_t _Offset) { + MI_UNUSED(_Alignment); MI_UNUSED(_Offset); return mi_malloc_size(_Block); + } + _Success_(return != 0) _Check_return_ 
_Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_offset_realloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment, _In_ size_t _Offset) { + return mi_realloc_aligned_at(_Block, _Size, _Alignment, _Offset); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_offset_recalloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Count, _In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment, _In_ size_t _Offset) { + return mi_recalloc_aligned_at(_Block, _Count, _Size, _Alignment, _Offset); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_realloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment) { + return mi_realloc_aligned(_Block, _Size, _Alignment); + } + _Success_(return != 0) _Check_return_ _Ret_maybenull_ _Post_writable_byte_size_(_Count * _Size) _ACRTIMP _CRTALLOCATOR _CRTRESTRICT + void* __cdecl _aligned_recalloc(_Pre_maybenull_ _Post_invalid_ void* _Block, _In_ _CRT_GUARDOVERFLOW size_t _Count, _In_ _CRT_GUARDOVERFLOW size_t _Size, _In_ size_t _Alignment) { + return mi_recalloc_aligned(_Block, _Count, _Size, _Alignment); + } #else // On all other systems forward allocation primitives to our API mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) - mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use 
`malloc` internally (but a more primitive call) // We only override if `strdup` is not a macro (as on some older libc's, see issue #885) #if !defined(strdup) mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) #endif #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)) - mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) + mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) #endif #endif @@ -203,18 +284,18 @@ typedef void* mi_nothrow_t; void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[] void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) - + void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } - void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete(void*, std::nothrow_t const&) + void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete(void*, std::nothrow_t const&) void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete[](void*, std::nothrow_t const&) - void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&) - void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&) - - #if (MI_INTPTR_SIZE==8) + void 
_ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&) + void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&) + + #if (MI_INTPTR_SIZE==8) || (MI_INTPTR_SIZE==4 && defined(__EMSCRIPTEN__)) // pr #1257 void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); } @@ -249,7 +330,7 @@ extern "C" { // Forward Posix/Unix calls as well void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize) size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p) - #if !defined(__ANDROID__) && !defined(__FreeBSD__) + #if !defined(__ANDROID__) && !defined(__FreeBSD__) && !defined(__DragonFly__) size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p) #else size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p) @@ -281,7 +362,9 @@ void cfree(void* p) { mi_free(p); } void* pvalloc(size_t size) { return mi_pvalloc(size); } mi_decl_weak // XXX EMSCRIPTEN void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } +#if !defined(_WIN32) void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } +#endif mi_decl_weak // XXX EMSCRIPTEN void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } // some systems define reallocarr so mark it as a weak symbol (#751) @@ -304,8 +387,8 @@ mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_r void* emscripten_builtin_calloc(size_t nmemb, size_t size) MI_FORWARD2(mi_calloc, nmemb, size) #endif -#elif defined(__GLIBC__) && defined(__linux__) - // forward __libc 
interface (needed for glibc-based Linux distributions) +#elif defined(__linux__) + // forward __libc interface (needed for glibc-based and musl-based Linux distributions) void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size) @@ -326,4 +409,4 @@ mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_r #pragma GCC visibility pop #endif -#endif // MI_MALLOC_OVERRIDE && !_WIN32 +#endif // MI_MALLOC_OVERRIDE diff --git a/system/lib/mimalloc/src/alloc-posix.c b/system/lib/mimalloc/src/alloc-posix.c index 225752fd8707f..a40bc974b00ac 100644 --- a/system/lib/mimalloc/src/alloc-posix.c +++ b/system/lib/mimalloc/src/alloc-posix.c @@ -100,7 +100,12 @@ mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size } mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD - void* newp = mi_reallocn(p,count,size); + size_t total; + if mi_unlikely(mi_count_size_overflow(count, size, &total)) { + errno = EOVERFLOW; + return NULL; + } + void* newp = mi_realloc(p,total); if (newp==NULL) { errno = ENOMEM; } return newp; } @@ -108,12 +113,15 @@ mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD mi_assert(p != NULL); if (p == NULL) { - errno = EINVAL; - return EINVAL; + return (errno = EINVAL); + } + size_t total; + if mi_unlikely(mi_count_size_overflow(count, size, &total)) { + return (errno = EOVERFLOW); } void** op = (void**)p; - void* newp = mi_reallocarray(*op, count, size); - if mi_unlikely(newp == NULL) { return errno; } + void* newp = mi_realloc(*op,total); + if (newp == NULL) { return ENOMEM; } *op = newp; return 0; } @@ -141,8 +149,8 @@ mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const 
unsigned char* } int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept { - if (buf==NULL || name==NULL) return EINVAL; if (size != NULL) *size = 0; + if (buf==NULL || name==NULL) return EINVAL; char* p = getenv(name); // mscver warning 4996 if (p==NULL) { *buf = NULL; @@ -150,14 +158,14 @@ int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept { else { *buf = mi_strdup(p); if (*buf==NULL) return ENOMEM; - if (size != NULL) *size = _mi_strlen(p); + if (size != NULL) { *size = _mi_strlen(p) + 1; } } return 0; } int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept { - if (buf==NULL || name==NULL) return EINVAL; if (size != NULL) *size = 0; + if (buf==NULL || name==NULL) return EINVAL; #if !defined(_WIN32) || (defined(WINAPI_FAMILY) && (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP)) // not supported *buf = NULL; @@ -170,7 +178,7 @@ int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) else { *buf = mi_wcsdup(p); if (*buf==NULL) return ENOMEM; - if (size != NULL) *size = wcslen((const wchar_t*)p); + if (size != NULL) { *size = wcslen((const wchar_t*)p) + 1; } } return 0; #endif diff --git a/system/lib/mimalloc/src/alloc.c b/system/lib/mimalloc/src/alloc.c index 86aaae757bddf..bce7bb9065312 100644 --- a/system/lib/mimalloc/src/alloc.c +++ b/system/lib/mimalloc/src/alloc.c @@ -1,5 +1,6 @@ + /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -28,20 +29,30 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. 
// Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. -extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +static mi_decl_forceinline void* mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero, size_t* usable) mi_attr_noexcept { - mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); + if (page->block_size != 0) { // not the empty theap + mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + } + + // check the free list mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { - return _mi_malloc_generic(heap, size, zero, 0); + return _mi_malloc_generic(theap, size, (zero ? 1 : 0), usable); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); + if (usable != NULL) { *usable = mi_page_usable_block_size(page); }; + // pop from the free list page->free = mi_block_next(page, block); page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); + mi_assert_internal(page->block_size < MI_MAX_ALIGN_SIZE || _mi_is_aligned(block, MI_MAX_ALIGN_SIZE)); + #if MI_DEBUG>3 - if (page->free_is_zero) { + if (page->free_is_zero && size > sizeof(*block)) { mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block))); } #endif @@ -49,49 +60,48 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ // allow use of the block internally // note: when tracking we need to avoid ever touching the MI_PADDING since // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc/track.h`) - mi_track_mem_undefined(block, mi_page_usable_block_size(page)); - - // zero the block? 
note: we need to zero the full block size (issue #63) - if mi_unlikely(zero) { - mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) - mi_assert_internal(page->block_size >= MI_PADDING_SIZE); - if (page->free_is_zero) { - block->next = 0; - mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); - } - else { - _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); - } - } - - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - if (!zero && !mi_page_is_huge(page)) { - memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); - } - #elif (MI_SECURE!=0) - if (!zero) { block->next = 0; } // don't leak internal data - #endif + const size_t bsize = mi_page_usable_block_size(page); + mi_track_mem_undefined(block, bsize); #if (MI_STAT>0) - const size_t bsize = mi_page_usable_block_size(page); - if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { - mi_heap_stat_increase(heap, normal, bsize); - mi_heap_stat_counter_increase(heap, normal_count, 1); + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { + mi_theap_stat_increase(theap, malloc_normal, bsize); #if (MI_STAT>1) + mi_theap_stat_counter_increase(theap, malloc_normal_count, 1); const size_t bin = _mi_bin(bsize); - mi_heap_stat_increase(heap, normal_bins[bin], 1); + mi_theap_stat_increase(theap, malloc_bins[bin], 1); + mi_theap_stat_increase(theap, malloc_requested, size - MI_PADDING_SIZE); #endif } #endif + // zero the block? 
note: we need to zero the full block size (issue #63) + if mi_likely(!zero) { + // #if MI_SECURE + block->next = 0; // don't leak internal data + // #endif + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + if (!mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, bsize); } + #endif + } + else { + if (!page->free_is_zero) { + _mi_memzero_aligned(block, bsize); + } + else { + block->next = 0; + mi_track_mem_defined(block, bsize); + } + } + #if MI_PADDING // && !MI_TRACK_ENABLED - mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); + mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + bsize); ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); #if (MI_DEBUG>=2) - mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + mi_assert_internal(delta >= 0 && bsize >= (size - MI_PADDING_SIZE + delta)); #endif mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess - padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->canary = mi_ptr_encode_canary(page,block,page->keys); padding->delta = (uint32_t)(delta); #if MI_PADDING_CHECK if (!mi_page_is_huge(page)) { @@ -105,35 +115,67 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ return block; } -// extra entries for improved efficiency in `alloc-aligned.c`. -extern void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { - return _mi_page_malloc_zero(heap,page,size,false); -} -extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { - return _mi_page_malloc_zero(heap,page,size,true); +// extra entries for improved efficiency in `alloc-aligned.c` (and in `page.c:mi_malloc_generic`. 
+extern void* _mi_page_malloc_zero(mi_theap_t* theap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { + return mi_page_malloc_zero(theap, page, size, zero, NULL); } -static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { - mi_assert(heap != NULL); +#if MI_GUARDED +mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept; +#endif + +// main allocation primitives for small and generic allocation + +// internal small size allocation +static mi_decl_forceinline mi_decl_restrict void* mi_theap_malloc_small_zero_nonnull(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept +{ + mi_assert(theap != NULL); + mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); - mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + mi_assert(theap->tld->thread_id == 0 || theap->tld->thread_id == tid); // theaps are thread local #endif - mi_assert(size <= MI_SMALL_SIZE_MAX); - #if (MI_PADDING) - if (size == 0) { size = sizeof(void*); } + #if (MI_PADDING || MI_GUARDED) + if mi_unlikely(size == 0) { size = sizeof(void*); } + #endif + #if MI_GUARDED + if mi_unlikely(mi_theap_malloc_use_guarded(theap,size)) { + return _mi_theap_malloc_guarded(theap, size, zero); + } #endif - mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); + // get page in constant time, and allocate from it + mi_page_t* page = _mi_theap_get_free_small_page(theap, size + MI_PADDING_SIZE); + void* const p = mi_page_malloc_zero(theap, page, size + MI_PADDING_SIZE, zero, usable); mi_track_malloc(p,size,zero); - #if MI_STAT>1 - if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } - mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); + #if MI_DEBUG>3 + if (p != 
NULL && zero) { + mi_assert_expensive(mi_mem_is_zero(p, size)); } #endif + return p; +} + +// internal generic allocation +static mi_decl_forceinline void* mi_theap_malloc_generic(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept +{ + #if MI_GUARDED + #if MI_THEAP_INITASNULL + if (theap!=NULL) + #endif + if (huge_alignment==0 && mi_theap_malloc_use_guarded(theap, size)) { + return _mi_theap_malloc_guarded(theap, size, zero); + } + #endif + #if !MI_THEAP_INITASNULL + mi_assert(theap!=NULL); + #endif + mi_assert(theap==NULL || theap->tld->thread_id == 0 || theap->tld->thread_id == _mi_thread_id()); // theaps are thread local + mi_assert((huge_alignment & 1)==0); + void* const p = _mi_malloc_generic(theap, size + MI_PADDING_SIZE, (zero ? 1 : 0) | huge_alignment, usable); // note: size can overflow but it is detected in malloc_generic + mi_track_malloc(p, size, zero); + #if MI_DEBUG>3 if (p != NULL && zero) { mi_assert_expensive(mi_mem_is_zero(p, size)); @@ -142,88 +184,165 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, return p; } +// internal small allocation +static mi_decl_forceinline mi_decl_restrict void* mi_theap_malloc_small_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept { + #if MI_THEAP_INITASNULL + if (theap!=NULL) { + return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable); + } + else { + return mi_theap_malloc_generic(theap, size, zero, 0, usable); // tailcall + } + #else + return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable); + #endif +} + + // allocate a small block -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { - return mi_heap_malloc_small_zero(heap, size, false); +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_malloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept { + return 
mi_theap_malloc_small_zero(theap, size, false, NULL); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { + return mi_theap_malloc_small(_mi_theap_default(), size); } -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { - return mi_heap_malloc_small(mi_prim_get_default_heap(), size); +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return mi_theap_malloc_small_zero_nonnull(_mi_heap_theap(heap), size, false, NULL); } -// The main allocation function -extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { +// The main internal allocation functions +static mi_decl_forceinline void* mi_theap_malloc_zero_nonnull(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept { + // fast path for small objects if mi_likely(size <= MI_SMALL_SIZE_MAX) { mi_assert_internal(huge_alignment == 0); - return mi_heap_malloc_small_zero(heap, size, zero); + return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable); } else { - mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local - void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic - mi_track_malloc(p,size,zero); - #if MI_STAT>1 - if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } - mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); - } - #endif - #if MI_DEBUG>3 - if (p != NULL && zero) { - mi_assert_expensive(mi_mem_is_zero(p, size)); - } - #endif - return p; + return mi_theap_malloc_generic(theap, size, zero, huge_alignment, usable); } } -extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { - return 
_mi_heap_malloc_zero_ex(heap, size, zero, 0); +extern mi_decl_forceinline void* _mi_theap_malloc_zero_ex(mi_theap_t* theap, size_t size, bool zero, size_t huge_alignment, size_t* usable) mi_attr_noexcept { + // fast path for small objects + #if MI_THEAP_INITASNULL + if mi_likely(theap!=NULL && size <= MI_SMALL_SIZE_MAX) + #else + if mi_likely(size <= MI_SMALL_SIZE_MAX) + #endif + { + mi_assert_internal(huge_alignment == 0); + return mi_theap_malloc_small_zero_nonnull(theap, size, zero, usable); + } + else { + return mi_theap_malloc_generic(theap, size, zero, huge_alignment, usable); + } } -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { - return _mi_heap_malloc_zero(heap, size, false); +void* _mi_theap_malloc_zero(mi_theap_t* theap, size_t size, bool zero, size_t* usable) mi_attr_noexcept { + return _mi_theap_malloc_zero_ex(theap, size, zero, 0, usable); } -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { - return mi_heap_malloc(mi_prim_get_default_heap(), size); + +// Main allocation functions + +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_malloc(mi_theap_t* theap, size_t size) mi_attr_noexcept { + return _mi_theap_malloc_zero(theap, size, false, NULL); } +mi_decl_nodiscard mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { + return mi_theap_malloc(_mi_theap_default(), size); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return mi_theap_malloc_zero_nonnull(_mi_heap_theap(heap), size, false, 0, NULL); +} + + // zero initialized small block mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { - return mi_heap_malloc_small_zero(mi_prim_get_default_heap(), size, true); + return mi_theap_malloc_small_zero(_mi_theap_default(), size, true, NULL); +} + +mi_decl_nodiscard extern inline mi_decl_restrict void* 
mi_theap_zalloc_small(mi_theap_t* theap, size_t size) mi_attr_noexcept { + return mi_theap_malloc_small_zero(theap, size, true, NULL); } -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { - return _mi_heap_malloc_zero(heap, size, true); +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return mi_theap_malloc_small_zero_nonnull(_mi_heap_theap(heap), size, true, NULL); +} + + +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_zalloc(mi_theap_t* theap, size_t size) mi_attr_noexcept { + return _mi_theap_malloc_zero(theap, size, true, NULL); } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept { - return mi_heap_zalloc(mi_prim_get_default_heap(),size); + return _mi_theap_malloc_zero(_mi_theap_default(), size, true, NULL); } +mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return mi_theap_malloc_zero_nonnull(_mi_heap_theap(heap), size, true, 0, NULL); +} -mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +mi_decl_nodiscard extern inline mi_decl_restrict void* mi_theap_calloc(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; - return mi_heap_zalloc(heap,total); + return mi_theap_zalloc(theap,total); } mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_calloc(mi_prim_get_default_heap(),count,size); + return mi_theap_calloc(_mi_theap_default(),count,size); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_count_size_overflow(count, size, &total)) return NULL; + return mi_heap_zalloc(heap, total); +} + +// Return 
usable size +mi_decl_nodiscard mi_decl_restrict void* mi_umalloc_small(size_t size, size_t* usable) mi_attr_noexcept { + return mi_theap_malloc_small_zero(_mi_theap_default(), size, false, usable); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_theap_umalloc(mi_theap_t* theap, size_t size, size_t* usable) mi_attr_noexcept { + return _mi_theap_malloc_zero_ex(theap, size, false, 0, usable); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_umalloc(size_t size, size_t* usable) mi_attr_noexcept { + return mi_theap_umalloc(_mi_theap_default(), size, usable); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_uzalloc(size_t size, size_t* usable) mi_attr_noexcept { + return _mi_theap_malloc_zero_ex(_mi_theap_default(), size, true, 0, usable); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_ucalloc(size_t count, size_t size, size_t* usable) mi_attr_noexcept { + size_t total; + if (mi_count_size_overflow(count,size,&total)) return NULL; + return mi_uzalloc(total, usable); } // Uninitialized `calloc` -mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { +static mi_decl_restrict void* mi_theap_mallocn(mi_theap_t* theap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; - return mi_heap_malloc(heap, total); + return mi_theap_malloc(theap, total); } mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_mallocn(mi_prim_get_default_heap(),count,size); + return mi_theap_mallocn(_mi_theap_default(),count,size); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_count_size_overflow(count, size, &total)) return NULL; + return mi_heap_malloc(heap, total); } + // Expand (or shrink) in place (or fail) void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { #if MI_PADDING @@ 
-232,25 +351,40 @@ void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { return NULL; #else if (p == NULL) return NULL; - const size_t size = _mi_usable_size(p,"mi_expand"); + const mi_page_t* const page = mi_validate_ptr_page(p,"mi_expand"); + const size_t size = _mi_usable_size(p,page); if (newsize > size) return NULL; return p; // it fits #endif } -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept { +void* _mi_theap_realloc_zero(mi_theap_t* theap, void* p, size_t newsize, bool zero, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept { // if p == NULL then behave as malloc. // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)). // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.) - const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0) - if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) + const mi_page_t* page; + size_t size; + if (p==NULL) { + page = NULL; + size = 0; + if (usable_pre!=NULL) { *usable_pre = 0; } + } + else { + page = mi_validate_ptr_page(p,"mi_realloc"); + size = _mi_usable_size(p,page); + if (usable_pre!=NULL) { *usable_pre = mi_page_usable_block_size(page); } + } + if mi_unlikely(newsize<=size && newsize>=(size/2) && newsize>0 // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) + && mi_page_heap(page)==_mi_theap_heap(theap)) // and within the same heap + { mi_assert_internal(p!=NULL); // todo: do not track as the usable size is still the same in the free; adjust potential padding? 
// mi_track_resize(p,size,newsize) // if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); } + if (usable_post!=NULL) { *usable_post = mi_page_usable_block_size(page); } return p; // reallocation still fits and not more than 50% waste } - void* newp = mi_heap_malloc(heap,newsize); + void* newp = mi_theap_umalloc(theap,newsize,usable_post); if mi_likely(newp != NULL) { if (zero && newsize > size) { // also set last word in the previous allocation to zero to ensure any padding is zero-initialized @@ -264,60 +398,86 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) const size_t copysize = (newsize > size ? size : newsize); mi_track_mem_defined(p,copysize); // _mi_useable_size may be too large for byte precise memory tracking.. _mi_memcpy(newp, p, copysize); - mi_free(p); // only free the original pointer if successful + mi_free(p); // only free the original pointer if successful // todo: optimize since page is known? } } return newp; } -mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { - return _mi_heap_realloc_zero(heap, p, newsize, false); +mi_decl_nodiscard void* mi_theap_realloc(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept { + return _mi_theap_realloc_zero(theap, p, newsize, false, NULL, NULL); } -mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +static void* mi_theap_reallocn(mi_theap_t* theap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; - return mi_heap_realloc(heap, p, total); + return mi_theap_realloc(theap, p, total); } // Reallocate but free `p` on errors -mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { - void* newp = mi_heap_realloc(heap, p, newsize); +static void* mi_theap_reallocf(mi_theap_t* theap, void* p, size_t newsize) 
mi_attr_noexcept { + void* newp = mi_theap_realloc(theap, p, newsize); if (newp==NULL && p!=NULL) mi_free(p); return newp; } -mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { - return _mi_heap_realloc_zero(heap, p, newsize, true); +static void* mi_theap_rezalloc(mi_theap_t* theap, void* p, size_t newsize) mi_attr_noexcept { + return _mi_theap_realloc_zero(theap, p, newsize, true, NULL, NULL); } -mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { +static void* mi_theap_recalloc(mi_theap_t* theap, void* p, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; - return mi_heap_rezalloc(heap, p, total); + return mi_theap_rezalloc(theap, p, total); } mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_realloc(mi_prim_get_default_heap(),p,newsize); + return mi_theap_realloc(_mi_theap_default(),p,newsize); } mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { - return mi_heap_reallocn(mi_prim_get_default_heap(),p,count,size); + return mi_theap_reallocn(_mi_theap_default(),p,count,size); +} + +mi_decl_nodiscard void* mi_urealloc(void* p, size_t newsize, size_t* usable_pre, size_t* usable_post) mi_attr_noexcept { + return _mi_theap_realloc_zero(_mi_theap_default(),p,newsize, false, usable_pre, usable_post); } // Reallocate but free `p` on errors mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_reallocf(mi_prim_get_default_heap(),p,newsize); + return mi_theap_reallocf(_mi_theap_default(),p,newsize); } mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_rezalloc(mi_prim_get_default_heap(), p, newsize); + return mi_theap_rezalloc(_mi_theap_default(), p, newsize); } mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, 
size_t size) mi_attr_noexcept { - return mi_heap_recalloc(mi_prim_get_default_heap(), p, count, size); + return mi_theap_recalloc(_mi_theap_default(), p, count, size); +} + + +mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + return mi_theap_realloc(_mi_heap_theap(heap), p, newsize); +} + +mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_theap_reallocn(_mi_heap_theap(heap), p, count, size); +} + +// Reallocate but free `p` on errors +mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + return mi_theap_reallocf(_mi_heap_theap(heap), p, newsize); +} + +mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + return mi_theap_rezalloc(_mi_heap_theap(heap), p, newsize); +} + +mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_theap_recalloc(_mi_heap_theap(heap), p, count, size); } @@ -327,10 +487,10 @@ mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_ // ------------------------------------------------------ // `strdup` using mi_malloc -mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { +mi_decl_nodiscard static mi_decl_restrict char* mi_theap_strdup(mi_theap_t* theap, const char* s) mi_attr_noexcept { if (s == NULL) return NULL; size_t len = _mi_strlen(s); - char* t = (char*)mi_heap_malloc(heap,len+1); + char* t = (char*)mi_theap_malloc(theap,len+1); if (t == NULL) return NULL; _mi_memcpy(t, s, len); t[len] = 0; @@ -338,14 +498,18 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const c } mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept { - return mi_heap_strdup(mi_prim_get_default_heap(), s); + return 
mi_theap_strdup(_mi_theap_default(), s); +} + +mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { + return mi_theap_strdup(_mi_heap_theap(heap), s); } // `strndup` using mi_malloc -mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { +mi_decl_nodiscard static mi_decl_restrict char* mi_theap_strndup(mi_theap_t* theap, const char* s, size_t n) mi_attr_noexcept { if (s == NULL) return NULL; const size_t len = _mi_strnlen(s,n); // len <= n - char* t = (char*)mi_heap_malloc(heap, len+1); + char* t = (char*)mi_theap_malloc(theap, len+1); if (t == NULL) return NULL; _mi_memcpy(t, s, len); t[len] = 0; @@ -353,7 +517,11 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const } mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { - return mi_heap_strndup(mi_prim_get_default_heap(),s,n); + return mi_theap_strndup(_mi_theap_default(),s,n); +} + +mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { + return mi_theap_strndup(_mi_heap_theap(heap), s, n); } #ifndef __wasi__ @@ -362,8 +530,8 @@ mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_ #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif -#include -mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { + +mi_decl_nodiscard static mi_decl_restrict char* mi_theap_realpath(mi_theap_t* theap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? 
buf : resolved_name), NULL); @@ -377,53 +545,31 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const return resolved_name; } else { - return mi_heap_strndup(heap, buf, PATH_MAX); + return mi_theap_strndup(theap, buf, PATH_MAX); } } #else -/* -#include // pathconf -static size_t mi_path_max(void) { - static size_t path_max = 0; - if (path_max <= 0) { - long m = pathconf("/",_PC_PATH_MAX); - if (m <= 0) path_max = 4096; // guess - else if (m < 256) path_max = 256; // at least 256 - else path_max = m; - } - return path_max; -} -*/ -char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { +char* mi_theap_realpath(mi_theap_t* theap, const char* fname, char* resolved_name) mi_attr_noexcept { if (resolved_name != NULL) { return realpath(fname,resolved_name); } else { char* rname = realpath(fname, NULL); if (rname == NULL) return NULL; - char* result = mi_heap_strdup(heap, rname); + char* result = mi_theap_strdup(theap, rname); mi_cfree(rname); // use checked free (which may be redirected to our free but that's ok) // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-( return result; } - /* - const size_t n = mi_path_max(); - char* buf = (char*)mi_malloc(n+1); - if (buf == NULL) { - errno = ENOMEM; - return NULL; - } - char* rname = realpath(fname,buf); - char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL` - mi_free(buf); - return result; - } - */ } #endif mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { - return mi_heap_realpath(mi_prim_get_default_heap(),fname,resolved_name); + return mi_theap_realpath(_mi_theap_default(),fname,resolved_name); +} + +mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { + return mi_theap_realpath(_mi_heap_theap(heap), fname, resolved_name); } #endif @@ -473,7 +619,7 @@ static 
std_new_handler_t mi_get_new_handler(void) { } #else // note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`. -static std_new_handler_t mi_get_new_handler() { +static std_new_handler_t mi_get_new_handler(void) { return NULL; } #endif @@ -494,43 +640,57 @@ static bool mi_try_new_handler(bool nothrow) { } #endif -mi_decl_export mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) { +static mi_decl_noinline void* mi_theap_try_new(mi_theap_t* theap, size_t size, bool nothrow ) { void* p = NULL; while(p == NULL && mi_try_new_handler(nothrow)) { - p = mi_heap_malloc(heap,size); + p = mi_theap_malloc(theap,size); } return p; } static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) { - return mi_heap_try_new(mi_prim_get_default_heap(), size, nothrow); + return mi_theap_try_new(_mi_theap_default(), size, nothrow); } +static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow) { + return mi_theap_try_new(_mi_heap_theap(heap), size, nothrow); +} -mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) { - void* p = mi_heap_malloc(heap,size); - if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false); + +mi_decl_nodiscard static mi_decl_restrict void* mi_theap_alloc_new(mi_theap_t* theap, size_t size) { + void* p = mi_theap_malloc(theap,size); + if mi_unlikely(p == NULL) return mi_theap_try_new(theap, size, false); return p; } mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) { - return mi_heap_alloc_new(mi_prim_get_default_heap(), size); + return mi_theap_alloc_new(_mi_theap_default(), size); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) { + void* p = mi_heap_malloc(heap, size); + if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false); + return p; } -mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) { 
+mi_decl_nodiscard static mi_decl_restrict void* mi_theap_alloc_new_n(mi_theap_t* theap, size_t count, size_t size) { size_t total; if mi_unlikely(mi_count_size_overflow(count, size, &total)) { mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc return NULL; } else { - return mi_heap_alloc_new(heap,total); + return mi_theap_alloc_new(theap,total); } } mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { - return mi_heap_alloc_new_n(mi_prim_get_default_heap(), size, count); + return mi_theap_alloc_new_n(_mi_theap_default(), count, size); +} + +mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) { + return mi_theap_alloc_new_n(_mi_heap_theap(heap), count, size); } @@ -577,22 +737,110 @@ mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) { } } +#if MI_GUARDED +// We always allocate a guarded allocation at an offset (`mi_page_has_interior_pointers` will be true). 
+// We then set the first word of the block to `0` for regular offset aligned allocations (in `alloc-aligned.c`) +// and the first word to `~0` for guarded allocations to have a correct `mi_usable_size` + +static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { + // TODO: we can still make padding work by moving it out of the guard page area + mi_page_t* const page = _mi_ptr_page(block); + mi_page_set_has_interior_pointers(page, true); + block->next = MI_BLOCK_TAG_GUARDED; + + // set guard page at the end of the block + const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local` + const size_t os_page_size = _mi_os_page_size(); + mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t)); + if (block_size < obj_size + os_page_size + sizeof(mi_block_t)) { + // should never happen + mi_free(block); + return NULL; + } + uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; + // note: the alignment of the guard page relies on blocks being os_page_size aligned which + // is ensured in `mi_arena_page_alloc_fresh`. + mi_assert_internal(_mi_is_aligned(block, os_page_size)); + mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); + if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) { + const bool ok = _mi_os_protect(guard_page, os_page_size); + if mi_unlikely(!ok) { + _mi_warning_message("failed to set a guard page behind an object (object %p of size %zu)\n", block, block_size); + } + } + else { + _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) 
(object %p of size %zu)\n", block, block_size); + } + + // align pointer just in front of the guard page + size_t offset = block_size - os_page_size - obj_size; + mi_assert_internal(offset > sizeof(mi_block_t)); + if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // give up to place it right in front of the guard page if the offset is too large for unalignment + offset = MI_PAGE_MAX_OVERALLOC_ALIGN; + } + uint8_t* const p = (uint8_t*)block + offset; + mi_assert_internal(p == guard_page - obj_size); + mi_track_align(block, p, offset, obj_size); + mi_track_mem_defined(block, sizeof(mi_block_t)); + return p; +} + +mi_decl_restrict void* _mi_theap_malloc_guarded(mi_theap_t* theap, size_t size, bool zero) mi_attr_noexcept +{ + // allocate multiple of page size ending in a guard page + // ensure minimal alignment requirement? + const size_t os_page_size = _mi_os_page_size(); + const size_t obj_size = (mi_option_is_enabled(mi_option_guarded_precise) ? size : _mi_align_up(size, MI_MAX_ALIGN_SIZE)); + const size_t bsize = _mi_align_up(_mi_align_up(obj_size, MI_MAX_ALIGN_SIZE) + sizeof(mi_block_t), MI_MAX_ALIGN_SIZE); + const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size); + mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(theap, req_size, 0 /* don't zero */, NULL); + if (block==NULL) return NULL; + void* const p = mi_block_ptr_set_guarded(block, obj_size); + if (zero) { + _mi_memzero_aligned(p,obj_size); // we have to zero here as padding might have written here (if the blocksize > reqsize + os_page_size) + } + + // stats + mi_track_malloc(p, obj_size, zero); + if (p != NULL) { + if (!mi_theap_is_initialized(theap)) { theap = _mi_theap_default(); } + #if MI_STAT>1 + // adjust stats to only count the allocated size of the block (and not the guard page) + mi_theap_stat_adjust_decrease(theap, malloc_requested, req_size); + mi_theap_stat_increase(theap, malloc_requested, size); + #endif + mi_theap_stat_counter_increase(theap, malloc_guarded_count, 1); + 
} + #if MI_DEBUG>3 + if (p != NULL && zero) { + mi_assert_expensive(mi_mem_is_zero(p, size)); + } + #endif + return p; +} +#endif + // ------------------------------------------------------ // ensure explicit external inline definitions are emitted! // ------------------------------------------------------ #ifdef __cplusplus void* _mi_externs[] = { - (void*)&_mi_page_malloc, - (void*)&_mi_heap_malloc_zero, - (void*)&_mi_heap_malloc_zero_ex, + (void*)&_mi_page_malloc_zero, + (void*)&_mi_theap_malloc_zero, + (void*)&_mi_theap_malloc_zero_ex, + (void*)&mi_theap_malloc, + (void*)&mi_theap_zalloc, + (void*)&mi_theap_malloc_small, (void*)&mi_malloc, (void*)&mi_malloc_small, + (void*)&mi_zalloc, (void*)&mi_zalloc_small, (void*)&mi_heap_malloc, - (void*)&mi_heap_zalloc, (void*)&mi_heap_malloc_small, - // (void*)&mi_heap_alloc_new, - // (void*)&mi_heap_alloc_new_n + (void*)&mi_malloc_aligned + // (void*)&mi_theap_alloc_new, + // (void*)&mi_theap_alloc_new_n }; #endif diff --git a/system/lib/mimalloc/src/arena-meta.c b/system/lib/mimalloc/src/arena-meta.c new file mode 100644 index 0000000000000..77ac6f6c4d991 --- /dev/null +++ b/system/lib/mimalloc/src/arena-meta.c @@ -0,0 +1,179 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + We have a special "mini" allocator just for allocation of meta-data like + the theap (`mi_theap_t`) or thread-local data (`mi_tld_t`). + + We reuse the bitmap of the arena's for allocation of 64b blocks inside + an arena slice (64KiB). 
+ We always ensure that meta data is zero'd (we zero on `free`) +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +/* ----------------------------------------------------------- + Meta data allocation +----------------------------------------------------------- */ + +#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE +#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN + +// large enough such that META_MAX_SIZE > 4k (even on 32-bit) +#define MI_META_BLOCK_SIZE (1 << (16 - MI_BCHUNK_BITS_SHIFT)) // 128 on 64-bit +#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE +#define MI_META_BLOCKS_PER_PAGE (MI_META_PAGE_SIZE / MI_META_BLOCK_SIZE) // 512 +#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) + +#if MI_META_MAX_SIZE <= 4096 +#error "max meta object size should be at least 4KiB" +#endif + +typedef struct mi_meta_page_s { + _Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released) + mi_memid_t memid; // provenance of the meta-page memory itself + mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block. 
+} mi_meta_page_t; + +static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL); + + +#if MI_DEBUG > 1 +static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { + mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)_mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size()); + if (block_idx != NULL) { + *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; + } + return mpage; +} +#endif + +static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { + return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next); +} + +static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { + mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN)); + mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); + void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE)); + mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); + return p; +} + +// allocate a fresh meta page and add it to the global list. +static mi_meta_page_t* mi_meta_page_zalloc(void) { + // allocate a fresh arena slice + // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. (same with _mi_os_numa_node()...) + mi_memid_t memid; + uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(mi_heap_main(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, + true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, + NULL /* req arena */, 0 /* thread_seq */, -1 /* numa node */, &memid); + if (base == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); + if (!memid.initially_zero) { + _mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE); + } + + // guard pages + #if MI_SECURE >= 1 + _mi_os_secure_guard_page_set_at(base, memid); + _mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid); + #endif + + // initialize the page and free block bitmap + mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size()); + mpage->memid = memid; + mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); + const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); + const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); + const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE); + mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE); + mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks); + + // push atomically in front of the meta page list + // (note: there is no ABA issue since we never free meta-pages) + mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + do { + mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old); + } while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage)); + return mpage; +} + + +// allocate meta-data +mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) +{ + mi_assert_internal(pmemid != NULL); + size = _mi_align_up(size,MI_META_BLOCK_SIZE); + if (size == 0 || size > MI_META_MAX_SIZE) return NULL; + const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE); + mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS); + mi_meta_page_t* mpage0 = 
mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + size_t block_idx; + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + else { + mpage = mi_meta_page_next(mpage); + } + } + // failed to find space in existing pages + if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) { + // the page list was updated by another thread in the meantime, retry + return _mi_meta_zalloc(size,pmemid); + } + // otherwise, allocate a fresh metapage and try once more + mpage = mi_meta_page_zalloc(); + if (mpage != NULL) { + size_t block_idx; + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + } + // if all this failed, allocate from the OS + return _mi_os_alloc(size, pmemid); +} + +// free meta-data +mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { + if (p==NULL) return; + if (memid.memkind == MI_MEM_META) { + mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count); + const size_t block_count = memid.mem.meta.block_count; + const size_t block_idx = memid.mem.meta.block_index; + mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; + mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); + mi_assert_internal(block_idx + block_count <= MI_META_BLOCKS_PER_PAGE); + mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); + // we zero on free (and on the initial page allocation) so we don't need a "dirty" map + _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); + 
mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count); + } + else { + _mi_arenas_free(p,size,memid); + } +} + +// used for debug output +bool _mi_meta_is_meta_page(void* p) +{ + mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + if ((void*)mpage == p) return true; + mpage = mi_meta_page_next(mpage); + } + return false; +} diff --git a/system/lib/mimalloc/src/arena.c b/system/lib/mimalloc/src/arena.c index 648ee844fedae..e203bee02aef4 100644 --- a/system/lib/mimalloc/src/arena.c +++ b/system/lib/mimalloc/src/arena.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023, Microsoft Research, Daan Leijen +Copyright (c) 2019-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,220 +7,204 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- "Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). +large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 64KiB). In contrast to the rest of mimalloc, the arenas are shared between threads and need to be accessed using atomic operations. -Arenas are used to for huge OS page (1GiB) reservations or for reserving +Arenas are also used to for huge OS page (1GiB) reservations or for reserving OS memory upfront which can be improve performance or is sometimes needed on embedded devices. We can also employ this with WASI or `sbrk` systems to reserve large arenas upfront and be able to reuse the memory more effectively. The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. 
-----------------------------------------------------------------------------*/ + #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -#include // memset -#include // ENOMEM - -#include "bitmap.h" // atomic bitmap - -/* ----------------------------------------------------------- - Arena allocation ------------------------------------------------------------ */ - -// Block info: bit 0 contains the `in_use` bit, the upper bits the -// size in count of arena blocks. -typedef uintptr_t mi_block_info_t; -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (112) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) - -// A memory arena descriptor -typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*) start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(size_t) search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? 
(can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) - // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. -} mi_arena_t; - - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - - -//static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept; +#include "mimalloc/prim.h" +#include "bitmap.h" /* ----------------------------------------------------------- Arena id's - id = arena_index + 1 ----------------------------------------------------------- */ -static size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +mi_arena_id_t _mi_arena_id_none(void) { + return NULL; } -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id) { + mi_arena_t* const arena = (mi_arena_t*)id; + mi_assert_internal(arena==NULL || arena->parent==NULL); // id's should never point to sub-arena's + return arena; } -mi_arena_id_t _mi_arena_id_none(void) { - return 0; +mi_arena_id_t mi_arena_id_from_arena(mi_arena_t* arena) { + mi_assert_internal(arena==NULL || arena->parent==NULL); + return (arena==NULL ? 
_mi_arena_id_none() : (mi_arena_id_t)arena); } -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); + +static bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena) { + if (arena == req_arena) return true; // they match + if (arena == NULL) return false; + if (req_arena == NULL && !arena->is_exclusive) return true; // or the arena is not exclusive, and we didn't request a specific one + if (arena->parent != NULL && arena->parent == req_arena) return true; // sub-arena? (note that req_arena is never a sub arena) + return false; } -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena) { if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + return mi_arena_is_suitable(memid.mem.arena.arena, request_arena); } else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + return mi_arena_is_suitable(NULL, request_arena); } } -bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) { - return (memid.memkind == MI_MEM_OS); +size_t mi_arenas_get_count(mi_subproc_t* subproc) { + return mi_atomic_load_relaxed(&subproc->arena_count); } -/* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the - lower 8 bits are the arena id, and the upper bits the block index. 
------------------------------------------------------------ */ +mi_arena_t* mi_arena_from_index(mi_subproc_t* subproc, size_t idx) { + mi_assert_internal(idx < mi_arenas_get_count(subproc)); + return mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[idx]); +} -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +static size_t mi_arena_info_slices(mi_arena_t* arena) { + return arena->info_slices; } -static size_t mi_arena_block_size(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); +#if MI_DEBUG > 1 +static bool mi_heap_has_page(mi_heap_t* heap, mi_arena_t* arena, mi_page_t* page) { + mi_assert(arena->arena_idx < MI_MAX_ARENAS); + mi_arena_pages_t* arena_pages = heap->arena_pages[arena->arena_idx]; + return (page->memid.memkind == MI_MEM_ARENA && + page->memid.mem.arena.arena == arena && + arena_pages != NULL && + mi_bitmap_is_setN(arena_pages->pages, page->memid.mem.arena.slice_index, 1)); } +#endif -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_arena_block_size(arena->block_count); +size_t mi_arena_min_alignment(void) { + return MI_ARENA_SLICE_ALIGN; } -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = bitmap_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; +size_t mi_arena_min_size(void) { + return MI_ARENA_MIN_SIZE; } -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *bitmap_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; +static size_t mi_arena_max_object_size(void) { + size_t max_size = mi_option_get_size(mi_option_arena_max_object_size); + max_size = _mi_align_up(max_size, MI_ARENA_SLICE_SIZE); 
+ if (max_size <= MI_ARENA_MIN_OBJ_SIZE) { + return MI_ARENA_MIN_OBJ_SIZE; + } + else if (max_size >= MI_ARENA_MAX_SIZE - MI_BCHUNK_SIZE) { // minus a bchunk to accommodate meta info + return (MI_ARENA_MAX_SIZE - MI_BCHUNK_SIZE); + } + else { + return max_size; + } +} + +mi_decl_nodiscard static bool mi_arena_commit(mi_arena_t* arena, void* start, size_t size, bool* is_zero, size_t already_committed) { + if (arena != NULL && arena->commit_fun != NULL) { + return (*arena->commit_fun)(true, start, size, is_zero, arena->commit_fun_arg); + } + else if (already_committed > 0) { + return _mi_os_commit_ex(start, size, is_zero, already_committed); + } + else { + return _mi_os_commit(start, size, is_zero); + } } /* ----------------------------------------------------------- - Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata) + Util ----------------------------------------------------------- */ -#define MI_ARENA_STATIC_MAX (MI_INTPTR_SIZE*MI_KiB) // 8 KiB on 64-bit -static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; - -static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { - *memid = _mi_memid_none(); - if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); - if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; - - // try to claim space - if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } - const size_t oversize = size + alignment - 1; - if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; - const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); - size_t top = oldtop + oversize; - if (top > MI_ARENA_STATIC_MAX) { - // try to roll back, ok if this fails - mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); - return NULL; - } 
- - // success - *memid = _mi_memid_create(MI_MEM_STATIC); - memid->initially_zero = true; - const size_t start = _mi_align_up(oldtop, alignment); - uint8_t* const p = &mi_arena_static[start]; - _mi_memzero_aligned(p, size); - return p; +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_slices(arena->slice_count); } -static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { - *memid = _mi_memid_none(); - - // try static - void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); - if (p != NULL) return p; +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} - // or fall back to the OS - p = _mi_os_alloc(size, memid, stats); - if (p == NULL) return NULL; +// Start of a slice +uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { + mi_assert_internal(slice_index < arena->slice_count); + return (mi_arena_start(arena) + mi_size_of_slices(slice_index)); +} - // zero the OS memory if needed - if (!memid->initially_zero) { - _mi_memzero_aligned(p, size); - memid->initially_zero = true; +mi_page_t* mi_arena_page_at_slice(mi_arena_t* arena, size_t slice_index) { + mi_assert_internal(slice_index < arena->slice_count); + if (arena->pages_meta != NULL) { + mi_page_t* const page = &arena->pages_meta[slice_index]; + #if MI_PAGE_META_ALIGNED_FREE_SMALL + // pages with small blocks still have the page at the start of the slice (and set the `block_size` in pages_meta to 0) + if (page->block_size>0) return page; + #else + return page; + #endif } - return p; + // fall through (for MI_PAGE_META_ALIGNED_FREE_SMALL) + return (mi_page_t*)mi_arena_slice_start(arena,slice_index); } -static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) { - if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, stats); - } - else { - mi_assert(memid.memkind == MI_MEM_STATIC); +// Arena area +void* 
mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (arena == NULL) return NULL; + if (size != NULL) { + mi_assert_internal(mi_size_of_slices(arena->slice_count) <= arena->total_size); + *size = arena->total_size; } + return mi_arena_start(arena); } -static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); -} +// Create an arena memid +static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + mi_assert_internal(slice_index < UINT32_MAX); + mi_assert_internal(slice_count < UINT32_MAX); + mi_assert_internal(slice_count > 0); + mi_assert_internal(slice_index < arena->slice_count); + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.arena = arena; + memid.mem.arena.slice_index = (uint32_t)slice_index; + memid.mem.arena.slice_count = (uint32_t)slice_count; + return memid; +} -/* ----------------------------------------------------------- - Thread safe allocation in an arena ------------------------------------------------------------ */ +// get the arena and slice span +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + mi_arena_t* arena = memid.mem.arena.arena; + if (slice_index!=NULL) { *slice_index = memid.mem.arena.slice_index; } + if (slice_count!=NULL) { *slice_count = memid.mem.arena.slice_count; } + return arena; +} -// claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, 
bitmap_idx, stats)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around - return true; - }; - return false; +static size_t mi_page_full_size(mi_page_t* page) { + if (page->memid.memkind == MI_MEM_ARENA) { + return page->memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; + } + else if (mi_memid_is_os(page->memid) || page->memid.memkind == MI_MEM_EXTERNAL) { + mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page); + const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base; + mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize); + return (presize > (ptrdiff_t)page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); + } + else { + return 0; + } } @@ -228,646 +212,1164 @@ static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index Arena Allocation ----------------------------------------------------------- */ -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) +static mi_decl_noinline void* mi_arena_try_alloc_at( + mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; + size_t slice_index; + if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! 
- void* p = mi_arena_block_start(arena, bitmap_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); + void* p = mi_arena_slice_start(arena, slice_index); + *memid = mi_memid_create_arena(arena, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - // none of the claimed blocks should be scheduled for a decommit - if (arena->blocks_purge != NULL) { - // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); - } - - // set the dirty bits (todo: no need for an atomic op here?) - if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + // set the dirty bits and track which slices become accessible + size_t touched_slices = slice_count; + if (arena->memid.initially_zero) { + size_t already_dirty = 0; + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, &already_dirty); + mi_assert_internal(already_dirty <= touched_slices); + touched_slices -= already_dirty; } // set commit state - if (arena->blocks_committed == NULL) { - // always committed - memid->initially_committed = true; - } - else if (commit) { + if (commit) { // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - bool any_uncommitted; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); - if (any_uncommitted) { + const size_t already_committed = mi_bitmap_popcountN(arena->slices_committed, slice_index, slice_count); + if (already_committed < slice_count) { + // not all committed, try to commit now bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), 
&commit_zero, tld->stats)) { - memid->initially_committed = false; + if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed))) { + // if the commit fails, release ownership, and return NULL; + // note: this does not roll back dirty bits but that is ok. + mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); + return NULL; } - else { - if (commit_zero) { memid->initially_zero = true; } + if (commit_zero) { + memid->initially_zero = true; } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } - - return p; -} -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED_RELEASE(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_arena_block_size(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } + // set the commit bits + 
mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, NULL); - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} + // committed + #if MI_DEBUG > 1 + if (memid->initially_zero) { + if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) { + _mi_error_message(EFAULT, "internal error: arena allocation was not zero-initialized!\n"); + memid->initially_zero = false; + } + } + #endif + } + else { + // already fully committed. + _mi_os_reuse(p, mi_size_of_slices(slice_count)); + // if the OS has overcommit, and this is the first time we access these pages, then + // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) + if (_mi_os_has_overcommit() && touched_slices > 0 && !arena->memid.is_pinned /* huge pages, issue #1236 */) { + mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices)); + } + } + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + memid->initially_committed = true; -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; + // tool support + if (memid->initially_zero) { + mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); + } 
+ else { + mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE); } } else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } + // no need to commit, but check if it is already fully committed + memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + if (!memid->initially_committed) { + // partly committed.. adjust stats + size_t already_committed_count = 0; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + mi_subproc_stat_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count)); } } - return NULL; + + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } + if (commit) { mi_assert_internal(memid->initially_committed); } + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + + return p; } + +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); + // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +static 
bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t* arena_id) { - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; + // calc reserve size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); if (arena_reserve == 0) return false; if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); + + if (arena_count >= 1 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + + // try to accommodate the requested size for huge allocations + req_size = _mi_align_up(req_size + MI_ARENA_MAX_CHUNK_OBJ_SIZE, MI_ARENA_MAX_CHUNK_OBJ_SIZE); // over-reserve for meta-info + if (arena_reserve < req_size) { + arena_reserve = req_size; + } + + // check arena bounds + const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB + if (arena_reserve < min_reserve) { + arena_reserve = min_reserve; + } + else if (arena_reserve > max_reserve) { + arena_reserve = max_reserve; } - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // should be able to at least handle the current allocation 
size + if (arena_reserve < req_size) return false; // commit eagerly? bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + const bool overcommit = _mi_os_has_overcommit(); + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = overcommit; } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); + // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice + // is actually allocated for the first time it will be counted. + const bool adjust = (overcommit && arena_commit); + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve); } + // and try to reserve the arena + int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0) { + if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve); } // roll back + // failed to allocate: try a smaller size arena as fallback? + const size_t small_arena_reserve = 4 * MI_ARENA_MIN_SIZE; // 128 MiB (or 32 MiB on 32-bit) + if (arena_reserve > small_arena_reserve && small_arena_reserve > req_size) { + // try again + if (adjust) { mi_subproc_stat_adjust_decrease(subproc, committed, small_arena_reserve); } + err = mi_reserve_os_memory_ex2(subproc, small_arena_reserve, arena_commit, allow_large, false /* exclusive? 
*/, arena_id); + if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, small_arena_reserve); } // roll back + } + } + return (err==0); +} + + + + +/* ----------------------------------------------------------- + Arena iteration +----------------------------------------------------------- */ + +static inline bool mi_arena_is_suitable_ex(mi_arena_t* arena, mi_arena_t* req_arena, bool match_numa, int numa_node, bool allow_pinned) { + if (!allow_pinned && arena->memid.is_pinned) return false; + if (!mi_arena_is_suitable(arena, req_arena)) return false; + if (req_arena == NULL) { // if not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa) { if (!numa_suitable) return false; } + else { if (numa_suitable) return false; } + } + return true; +} + +// determine the start of search; important to keep heaps and threads +// into their own memory regions to reduce contention. +static size_t mi_arena_start_idx(mi_heap_t* heap, size_t tseq, size_t arena_cycle) { + const size_t hseq = heap->heap_seq; + const size_t hcount = mi_atomic_load_relaxed(&heap->subproc->heap_count); + if (arena_cycle <= 1) return 0; + if (hseq==0 || hcount<=1) return (tseq % arena_cycle); // common for single heap programs + + // spread heaps evenly among arena's, and then evenly for threads in their fraction + size_t start; + mi_assert_internal(arena_cycle <= 0x8FF); // prevent overflow on 32-bit + const size_t frac = (arena_cycle * 256) / hcount; // fraction in the arena_cycle; at most: arena_cycle * 0x100 + if (frac==0) { + // many heaps (> 256 per arena) + start = (hseq % arena_cycle); + } + else { + const size_t hspot = (hseq % hcount); + start = (frac * hspot) / 256; + if (frac >= 512) { // at least 2 arena's per heap? 
+ start = start + (tseq % (frac/256)); + } + } + mi_assert_internal(start < arena_cycle); + return start; } +#define mi_forall_arenas(heap, req_arena, tseq, name_arena) { \ + const size_t _arena_count = mi_arenas_get_count(heap->subproc); \ + const size_t _arena_cycle = (_arena_count == 0 ? 0 : _arena_count - 1); /* first search the arenas below the last one */ \ + /* always start searching in the arena's below the max */ \ + const size_t _start = mi_arena_start_idx(heap,tseq,_arena_cycle); \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + mi_arena_t* name_arena; \ + if (req_arena != NULL) { \ + name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ + if (_i > 0) break; /* only once */ \ + } \ + else { \ + size_t _idx; \ + if (_i < _arena_cycle) { \ + _idx = _i + _start; \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \ + } \ + else { \ + _idx = _i; /* remaining arena's after the cycle */ \ + } \ + name_arena = mi_arena_from_index(heap->subproc,_idx); \ + } \ + if (name_arena != NULL) \ + { + +#define mi_forall_arenas_end() \ + } \ + } \ + } + +#define mi_forall_suitable_arenas(heap, req_arena, tseq, match_numa, numa_node, allow_large, name_arena) \ + mi_forall_arenas(heap, req_arena,tseq,name_arena) { \ + if (mi_arena_is_suitable_ex(name_arena, req_arena, match_numa, numa_node, allow_large)) { \ + +#define mi_forall_suitable_arenas_end() \ + }} \ + mi_forall_arenas_end() + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +// allocate slices from the arenas +static mi_decl_noinline void* mi_arenas_try_find_free( + mi_heap_t* heap, size_t slice_count, size_t alignment, + bool commit, bool allow_large, 
mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = _mi_memid_none(); + // mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_CHUNK_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } + // search arena's + mi_forall_suitable_arenas(heap, req_arena, tseq, true /* only numa matching */, numa_node, allow_large, arena) + { + void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); + if (p != NULL) return p; + } + mi_forall_suitable_arenas_end(); + if (numa_node < 0) return NULL; + + // search again but now regardless of preferred numa affinity + mi_forall_suitable_arenas(heap, req_arena, tseq, false /* numa non-matching now */, numa_node, allow_large, arena) + { + void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); + if (p != NULL) return p; + } + 
mi_forall_suitable_arenas_end(); + return NULL; +} + +// Allocate slices from the arena's -- potentially allocating a fresh arena +static mi_decl_noinline void* mi_arenas_try_alloc( + mi_heap_t* heap, + size_t slice_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) +{ + // mi_assert(slice_count <= MI_ARENA_MAX_CHUNK_OBJ_SLICES); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + void* p; + + // not too large? + if (slice_count * MI_ARENA_SLICE_SIZE > MI_ARENA_MAX_SIZE) return NULL; + + // try to find free slices in the arena's + p = mi_arenas_try_find_free(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); + if (p != NULL) return p; + + // did we need a specific arena? + if (req_arena != NULL) return NULL; + + // don't create arena's while preloading (todo: or should we?) + if (_mi_preloading()) return NULL; + + // don't create arena's if OS allocation is disallowed + if (mi_option_is_enabled(mi_option_disallow_os_alloc)) return NULL; + + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) 
+ mi_subproc_t* const subproc = heap->subproc; + const size_t arena_count = mi_arenas_get_count(subproc); + mi_lock(&subproc->arena_reserve_lock) { + if (arena_count == mi_arenas_get_count(subproc)) { + // we are the first to enter the lock, reserve a fresh arena + mi_arena_id_t arena_id = _mi_arena_id_none(); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, &arena_id); + } + else { + // another thread already reserved a new arena } } + // try once more to allocate in the new arena + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); + if (p != NULL) return p; + + return NULL; +} +// Allocate from the OS (if allowed) +static void* mi_arena_os_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid) +{ // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - // finally, fall back to the OS if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) + +// Allocate large sized memory +void* _mi_arenas_alloc_aligned( mi_heap_t* heap, + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, 
commit, allow_large, req_arena_id, memid, tld); -} + mi_assert_internal(memid != NULL); + mi_assert_internal(size > 0); + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for theap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + size >= MI_ARENA_MIN_OBJ_SIZE && size <= mi_arena_max_object_size() && // and not too small or too large + alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment + { + const size_t slice_count = mi_slice_count_of_size(size); + void* p = mi_arenas_try_alloc(heap, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); + if (p != NULL) return p; + } -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } - return arena->start; + // fall back to the OS + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena, memid); + return p; +} + +void* _mi_arenas_alloc(mi_heap_t* heap, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) +{ + return _mi_arenas_alloc_aligned(heap, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, numa_node, memid); } + /* ----------------------------------------------------------- - Arena purge + Arena page allocation ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +// release ownership of a page. 
This may free the page if all blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +static bool mi_abandoned_page_unown(mi_page_t* page, mi_theap_t* current_theap) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(_mi_thread_id()==current_theap->tld->thread_id); + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_assert_internal(mi_tf_is_owned(tf_old)); + while mi_unlikely(mi_tf_block(tf_old) != NULL) { + _mi_page_free_collect(page, false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arenas_page_unabandon(page, current_theap); + _mi_arenas_page_free(page, current_theap); + return true; + } + tf_old = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_old)==NULL); + tf_new = mi_tf_create(NULL, false); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + return false; } -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_arena_block_size(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); - bool needs_recommit; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); + +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, bool* keep_abandoned) { + // found an abandoned page of the right size + mi_page_t* const page = mi_arena_page_at_slice(arena, slice_index); + // can we claim ownership? + if (!mi_page_claim_ownership(page)) { + // there was a concurrent free that reclaims this page .. + // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`, + // and wait for readers (us!) to finish. This is why it is very important to set the abandoned + // bit again (or otherwise the unabandon will never stop waiting). + *keep_abandoned = true; + return false; } else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? 
*/, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); - // update committed bitmap - if (needs_recommit) { - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + // yes, we can reclaim it, keep the abandoned map entry clear + *keep_abandoned = false; + return true; } } -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_purge != NULL); - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? +// allocate initial arena_pages from the main heap +static mi_arena_pages_t* mi_arena_pages_alloc(mi_arena_t* arena); - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); - } - else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - } - else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); - } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); - } -} - -// purge a range of blocks -// return true if the full range was purged. -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startidx + bitlen; - size_t bitidx = startidx; - bool all_purged = false; - while (bitidx < endidx) { - // count consequetive ones in the purge mask - size_t count = 0; - while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { - count++; - } - if (count > 0) { - // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); - mi_arena_purge(arena, range_idx, count, stats); - if (count == bitlen) { - all_purged = true; - } - } - bitidx += (count+1); // +1 to skip the zero bit (or end) +static mi_arena_pages_t* mi_heap_arena_pages(mi_heap_t* heap, mi_arena_t* arena) { + mi_assert_internal(arena!=NULL); + mi_assert_internal(heap!=NULL); + mi_assert(arena->arena_idx < MI_MAX_ARENAS); + return mi_atomic_load_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx]); +} + +static mi_arena_t* mi_page_arena_pages(mi_page_t* page, size_t* slice_index, size_t* slice_count, mi_arena_pages_t** parena_pages) { + // todo: maybe store the arena* directly in the page? 
+ mi_assert_internal(mi_page_is_owned(page)); + mi_arena_t* const arena = mi_arena_from_memid(page->memid, slice_index, slice_count); + mi_assert_internal(arena != NULL); + if (parena_pages != NULL) { + mi_arena_pages_t* const arena_pages = mi_heap_arena_pages(mi_page_heap(page), arena); + mi_assert_internal(arena_pages != NULL); + mi_assert_internal(slice_index==NULL || mi_bitmap_is_set(arena_pages->pages, *slice_index)); + *parena_pages = arena_pages; } - return all_purged; + return arena; } -// returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) -{ - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; - - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - - // potential purges scheduled, walk through the bitmap - bool any_purged = false; - bool full_purge = true; - for (size_t i = 0; i < arena->field_count; i++) { - size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); - if (purge != 0) { - size_t bitidx = 0; - while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consequetive range of ones in the purge mask - size_t bitlen = 0; - while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { - bitlen++; - } - // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); - while( bitlen > 0 ) { - if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { - break; - } - bitlen--; +static mi_arena_pages_t* mi_heap_ensure_arena_pages(mi_heap_t* heap, mi_arena_t* arena) { + mi_assert_internal(arena!=NULL); + mi_assert_internal(heap!=NULL); + mi_assert(arena->arena_idx < MI_MAX_ARENAS); + 
mi_arena_pages_t* arena_pages = mi_heap_arena_pages(heap, arena); + if (arena_pages==NULL) { + mi_lock(&heap->arena_pages_lock) { + arena_pages = mi_atomic_load_ptr_acquire(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx]); + if (arena_pages == NULL) { // still NULL? + if (_mi_is_heap_main(heap)) { + // the page info for the main heap is always allocated as part of an arena + arena_pages = &arena->pages_main; } - // actual claimed bits at `in_use` - if (bitlen > 0) { - // read purge again now that we have the in_use bits - purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { - full_purge = false; - } - any_purged = true; - // release the claimed `in_use` bits again - _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); + else { + // always allocate the arena pages info from the main heap + // todo: allocate into the current arena? + arena_pages = mi_arena_pages_alloc(arena); } - bitidx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitidx - } // purge != 0 - } - // if not fully purged, make sure to purge again in the future - if (!full_purge) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); + mi_atomic_store_ptr_release(mi_arena_pages_t, &heap->arena_pages[arena->arena_idx], arena_pages); + } + } } - return any_purged; + if (_mi_is_heap_main(heap)) { mi_assert(arena_pages != NULL); } // can never fail + return arena_pages; } -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled +static mi_page_t* mi_arenas_page_try_find_abandoned(mi_theap_t* theap, size_t slice_count, size_t block_size) +{ + mi_heap_t* const heap = _mi_theap_heap(theap); + const size_t tseq = theap->tld->thread_seq; + mi_arena_t* const 
req_arena = heap->exclusive_arena; - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; + MI_UNUSED(slice_count); + const size_t bin = _mi_bin(block_size); + mi_assert_internal(bin < MI_BIN_COUNT); - // allow only one thread to purge at a time - static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) + // any abandoned in our size class? + mi_assert_internal(heap != NULL); + if (mi_atomic_load_relaxed(&heap->abandoned_count[bin]) == 0) { + return NULL; + } + + // search arena's + const bool allow_large = true; + const int any_numa = -1; + const bool match_numa = true; + mi_forall_suitable_arenas(heap, req_arena, tseq, match_numa, any_numa, allow_large, arena) { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? max_arena : 1); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; - max_purge_count--; - } + mi_arena_pages_t* const arena_pages = mi_heap_arena_pages(heap, arena); + if (arena_pages != NULL) { + size_t slice_index; + mi_bitmap_t* const bitmap = arena_pages->pages_abandoned[bin]; + + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena)) { + // found an abandoned page of the right size + // and claimed ownership. 
+ mi_page_t* page = mi_arena_page_at_slice(arena, slice_index); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_heap_has_page(heap, arena, page)); + mi_atomic_decrement_relaxed(&heap->abandoned_count[bin]); + mi_theap_stat_decrease(theap, pages_abandoned, 1); + mi_theap_stat_counter_increase(theap, pages_reclaim_on_alloc, 1); + + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + return page; } } } + mi_forall_suitable_arenas_end(); + return NULL; } +static uint8_t* mi_arenas_page_alloc_fresh_area(mi_theap_t* theap, size_t slice_count, size_t block_size, size_t block_alignment, bool os_align, bool commit, mi_memid_t* memid) { + MI_UNUSED_RELEASE(block_size); + const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page + const size_t page_alignment = MI_ARENA_SLICE_ALIGN; -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); + mi_heap_t* const heap = _mi_theap_heap(theap); + mi_tld_t* const tld = theap->tld; + mi_arena_t* const req_arena 
= heap->exclusive_arena; + const int numa_node = (heap->numa_node >= 0 ? heap->numa_node : tld->numa_node); - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + // try to allocate from free space in arena's + uint8_t* start = NULL; + *memid = _mi_memid_none(); + const size_t alloc_size = mi_size_of_slices(slice_count); + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? + !os_align && // not large alignment + slice_count <= mi_arena_max_object_size()/MI_ARENA_SLICE_SIZE) // and not too large + { + start = (uint8_t*)mi_arenas_try_alloc(heap, slice_count, page_alignment, commit, allow_large, req_arena, tld->thread_seq, numa_node, memid); + if (start != NULL) { + mi_arena_pages_t* const arena_pages = mi_heap_ensure_arena_pages(heap, memid->mem.arena.arena); + if (arena_pages==NULL) { + _mi_arenas_free(start, mi_size_of_slices(slice_count), *memid); // roll back + start = NULL; + } + else { + mi_assert_internal(mi_bitmap_is_clearN(arena_pages->pages, memid->mem.arena.slice_index, memid->mem.arena.slice_count)); + mi_bitmap_set(arena_pages->pages, memid->mem.arena.slice_index); + } } - _mi_os_free(p, size, memid, stats); } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - 
return; - } - mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); - if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) - mi_track_mem_undefined(p,size); - - // potentially decommit - if (arena->memid.is_pinned || arena->blocks_committed == NULL) { - mi_assert_internal(all_committed); + // otherwise fall back to the OS + if (start == NULL) { + if (os_align) { + // note: slice_count already includes the page + mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); + start = (uint8_t*)mi_arena_os_alloc_aligned(alloc_size, block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, memid); } else { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - mi_track_mem_noaccess(p,size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). 
- } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + start = (uint8_t*)mi_arena_os_alloc_aligned(alloc_size, page_alignment, 0 /* align offset */, commit, allow_large, req_arena, memid); } + } - // and make it available to others again - bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; + if (start == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(start, MI_PAGE_ALIGN)); + mi_assert_internal(!os_align || _mi_is_aligned(start + page_alignment, block_alignment)); + return start; +} + +static size_t mi_page_block_start(size_t block_size, bool os_align) +{ + #if MI_GUARDED + // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size + // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) + const size_t os_page_size = _mi_os_page_size(); + mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); + if (!os_align && block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { + return _mi_align_up(mi_page_info_size(), os_page_size); + } + else + #endif + if (os_align) { + return MI_PAGE_ALIGN; + } + else if (_mi_is_power_of_two(block_size) && block_size <= MI_PAGE_MAX_START_BLOCK_ALIGN2) { + // naturally align power-of-2 blocks up to MI_PAGE_MAX_START_BLOCK_ALIGN2 size (4KiB) + return _mi_align_up(mi_page_info_size(), block_size); + } + else if (block_size != 0 && (block_size % MI_PAGE_OSPAGE_BLOCK_ALIGN2) == 0) { + // also align large pages that are a multiple of MI_PAGE_OSPAGE_BLOCK_ALIGN2 (4KiB) + return _mi_align_up(mi_page_info_size(), MI_PAGE_OSPAGE_BLOCK_ALIGN2); } else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); + // otherwise start after 
the info + return mi_page_info_size(); + } +} + +// Allocate a fresh page +static mi_page_t* mi_arenas_page_alloc_fresh(mi_theap_t* theap, size_t slice_count, size_t block_size, size_t block_alignment, bool commit) +{ + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t alloc_size = mi_size_of_slices(slice_count); + mi_memid_t memid = _mi_memid_none(); + uint8_t* const slice_start = mi_arenas_page_alloc_fresh_area(theap,slice_count,block_size,block_alignment,os_align,commit,&memid); + if (!slice_start) return NULL; + + // guard page at the end of mimalloc page? + #if (MI_SECURE >= 2 && (!MI_PAGE_META_IS_SEPARATED || MI_PAGE_META_ALIGNED_FREE_SMALL)) || MI_SECURE >= 4 + mi_assert(alloc_size > _mi_os_secure_guard_page_size()); + const size_t page_noguard_size = alloc_size - _mi_os_secure_guard_page_size(); + if (memid.initially_committed) { + _mi_os_secure_guard_page_set_at(slice_start + page_noguard_size, memid); } + #else + const size_t page_noguard_size = alloc_size; + #endif - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} + // allocate the page meta info + mi_page_t* page = NULL; + bool page_meta_is_separate = false; + size_t block_start = 0; -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. 
-static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - else { - new_max_arena = i; + // allocate page meta info at the arena start? + if (memid.memkind == MI_MEM_ARENA) { + mi_arena_t* const arena = memid.mem.arena.arena; + if (arena->pages_meta != NULL) { + mi_assert_internal(MI_PAGE_META_IS_SEPARATED!=0); + mi_page_t* const page_meta = &arena->pages_meta[memid.mem.arena.slice_index]; + mi_assert_internal(page_meta->block_size == 0); + #if MI_PAGE_META_ALIGNED_FREE_SMALL + // if `block_size <= MI_SMALL_SIZE_MAX` we put the page info in front of the slice, + // (note: it is important that `page_meta->block_size == 0` for `mi_arena_page_at_slice`) + if (block_size > MI_SMALL_SIZE_MAX) + #endif + { + page = page_meta; + page_meta_is_separate = true; + block_start = 0; + #if !defined(MI_PAGE_BLOCK_START_MAX_OFFSET) + #define MI_PAGE_BLOCK_START_MAX_OFFSET (8*MI_INTPTR_BITS) /* 512 */ + #endif + if (block_size >= MI_INTPTR_SIZE && block_size <= MI_PAGE_BLOCK_START_MAX_OFFSET && _mi_is_power_of_two(block_size)) { + block_start += block_size; + } + mi_assert_internal(page->block_size == 0); + _mi_memzero_aligned(page, sizeof(*page)); } - mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main); } } + if (page == NULL) { + // put page meta info in front of the slice + page = (mi_page_t*)slice_start; + block_start = mi_page_block_start(block_size, os_align); + } - // try to lower the max arena. 
- size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); + // commit first block? + size_t commit_size = 0; + if (!memid.initially_committed) { + commit_size = _mi_align_up(block_start + block_size, MI_PAGE_MIN_COMMIT_SIZE); + if (commit_size > page_noguard_size) { commit_size = page_noguard_size; } + bool is_zero = false; + if mi_unlikely(!mi_arena_commit( mi_memid_arena(memid), slice_start, commit_size, &is_zero, 0)) { + _mi_arenas_free(slice_start, alloc_size, memid); + return NULL; + } + } + if (!memid.initially_zero && !page_meta_is_separate) { + _mi_memzero_aligned(page, sizeof(*page)); + } + + // claimed free slices: initialize the page partly + if (!memid.initially_zero && memid.initially_committed) { + mi_track_mem_undefined(slice_start, slice_count * MI_ARENA_SLICE_SIZE); + } + else if (memid.initially_committed) { + mi_track_mem_defined(slice_start, slice_count * MI_ARENA_SLICE_SIZE); + } + #if MI_DEBUG > 1 + if (memid.initially_zero && memid.initially_committed) { + if (!mi_mem_is_zero(slice_start, page_noguard_size)) { + _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); + memid.initially_zero = false; + if (block_start > 0) { _mi_memzero_aligned(page, sizeof(*page)); } + } + } + #endif + const size_t reserved = (os_align ? 
1 : (page_noguard_size - block_start) / block_size); + mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); + + // initialize + page->reserved = (uint16_t)reserved; + page->page_start = slice_start + block_start; + page->block_size = block_size; + page->slice_committed = commit_size; + page->memid = memid; + page->free_is_zero = memid.initially_zero; + mi_assert_internal(page->free==NULL); + mi_assert_internal(page_meta_is_separate == mi_page_meta_is_separated(page)); + mi_assert_internal(mi_page_slice_start(page) == slice_start); + + // and own it + mi_page_claim_ownership(page); + + // register in the page map + if mi_unlikely(!_mi_page_map_register(page)) { + _mi_arenas_free( slice_start, alloc_size, memid ); + return NULL; + } + + // stats + mi_theap_stat_increase(theap, pages, 1); + mi_theap_stat_increase(theap, page_bins[_mi_page_stats_bin(page)], 1); + + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page),MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_owned(page)); + + return page; } -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +// Allocate a regular small/medium/large page. +static mi_page_t* mi_arenas_page_regular_alloc(mi_theap_t* theap, size_t slice_count, size_t block_size) +{ + // 1. look for an abandoned page + mi_page_t* page = mi_arenas_page_try_find_abandoned(theap, slice_count, block_size); + if (page != NULL) { + return page; // return as abandoned + } + + // 2. 
find a free block, potentially allocating a new arena + const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); + const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages + (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0)); + page = mi_arenas_page_alloc_fresh(theap, slice_count, block_size, 1, commit); + if (page == NULL) return NULL; + + mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); + if (!_mi_page_init(theap, page)) { + _mi_arenas_free( page, mi_page_full_size(page), page->memid); + return NULL; + } + + return page; } -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +// Allocate a page containing one block (very large, or with large alignment) +static mi_page_t* mi_arenas_page_singleton_alloc(mi_theap_t* theap, size_t block_size, size_t block_alignment) +{ + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t info_size = (os_align ? 
MI_PAGE_ALIGN : mi_page_info_size()); + #if MI_SECURE < 2 + const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + #else + const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); + #endif + + mi_page_t* page = mi_arenas_page_alloc_fresh(theap, slice_count, block_size, block_alignment, true /* commit singletons always */); + if (page == NULL) return NULL; + + mi_assert(page->reserved == 1); + if (!_mi_page_init(theap, page)) { + _mi_arenas_free( page, mi_page_full_size(page), page->memid); + return NULL; + } + + return page; } -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { - return true; + +mi_page_t* _mi_arenas_page_alloc(mi_theap_t* theap, size_t block_size, size_t block_alignment) { + mi_page_t* page; + if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + mi_assert_internal(_mi_is_power_of_two(block_alignment)); + page = mi_arenas_page_singleton_alloc(theap, block_size, block_alignment); + } + else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { + page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + } + else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { + page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + } + #if MI_ENABLE_LARGE_PAGES + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arenas_page_regular_alloc(theap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } + #endif + else { + page = mi_arenas_page_singleton_alloc(theap, block_size, block_alignment); 
+ } + if mi_unlikely(page == NULL) { + return NULL; + } + // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(block_alignment <= MI_PAGE_MAX_OVERALLOC_ALIGN || _mi_is_aligned(mi_page_start(page), block_alignment)); + + return page; +} + +void _mi_arenas_page_free(mi_page_t* page, mi_theap_t* current_theapx) { + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_all_free(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL && page->prev==NULL); + mi_assert_internal(current_theapx == NULL || _mi_thread_id()==current_theapx->tld->thread_id); + + if (current_theapx != NULL) { + mi_theap_stat_decrease(current_theapx, page_bins[_mi_page_stats_bin(page)], 1); + mi_theap_stat_decrease(current_theapx, pages, 1); + } + else { + mi_heap_t* const heap = mi_page_heap(page); + mi_heap_stat_decrease(heap, page_bins[_mi_page_stats_bin(page)], 1); + mi_heap_stat_decrease(heap, pages, 1); + } + + #if MI_DEBUG>1 + if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { + size_t bin = _mi_bin(mi_page_block_size(page)); + size_t slice_index; + size_t slice_count; + mi_arena_pages_t* arena_pages = NULL; + mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena_pages->pages_abandoned[bin], slice_index, 1)); + mi_assert_internal(mi_bitmap_is_setN(arena_pages->pages, slice_index, 1)); + 
// note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may + // be (temporarily) not true if the free happens while trying to reclaim + // see `mi_arena_try_claim_abandoned` + } + #endif + + // recommit guard page at the end? + // we must do this since we may later allocate large spans over this page and cannot have a guard page in between + #if (MI_SECURE >= 2 && (!MI_PAGE_META_IS_SEPARATED || MI_PAGE_META_ALIGNED_FREE_SMALL)) || MI_SECURE >= 4 + if (!page->memid.is_pinned) { + _mi_os_secure_guard_page_reset_before(mi_page_slice_start(page) + mi_page_full_size(page), page->memid); + } + #endif + + // unregister page + _mi_page_map_unregister(page); + if (page->memid.memkind == MI_MEM_ARENA) { + mi_arena_pages_t* arena_pages; + size_t slice_index; + size_t slice_count; MI_UNUSED(slice_count); + mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); + mi_assert_internal(arena_pages!=NULL); + mi_bitmap_clear(arena_pages->pages, slice_index); + if (page->slice_committed > 0) { + // if committed on-demand, set the commit bits to account commit properly + mi_assert_internal(mi_page_full_size(page) >= page->slice_committed); + const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative + //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, total_slices)); + mi_assert_internal(slice_count >= total_slices); + if (total_slices > 0) { + mi_bitmap_setN(arena->slices_committed, slice_index, total_slices, NULL); + } + // any left over? 
+ const size_t extra = page->slice_committed % MI_ARENA_SLICE_SIZE; + if (extra > 0) { + // pretend it was decommitted already + mi_subproc_stat_decrease(arena->subproc, committed, extra); + } + } + else { + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } } - return false; + if (mi_page_meta_is_separated(page)) { page->block_size = 0; } // for assertion checking + _mi_arenas_free( mi_page_slice_start(page), mi_page_full_size(page), page->memid); } /* ----------------------------------------------------------- - Abandoned blocks/segments. - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment; these threads scan the abandoned segments through - the arena bitmaps. + Arena abandon ----------------------------------------------------------- */ -// Maintain a count of all abandoned segments -static mi_decl_cache_align _Atomic(size_t)abandoned_count; - -size_t _mi_arena_segment_abandoned_count(void) { - return mi_atomic_load_relaxed(&abandoned_count); +void _mi_arenas_page_abandon(mi_page_t* page, mi_theap_t* current_theap) { + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_all_free(page)); + mi_assert_internal(page->next==NULL && page->prev == NULL); + mi_assert_internal(_mi_thread_id()==current_theap->tld->thread_id); + // mi_assert_internal(current_theap == _mi_page_associated_theap(page)); + + mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(heap==_mi_theap_heap(current_theap)); + if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { + // make available for allocations + size_t bin = 
_mi_bin(mi_page_block_size(page)); + size_t slice_index; + size_t slice_count; + mi_arena_pages_t* arena_pages = NULL; + mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); MI_UNUSED(arena); + + mi_assert_internal(!mi_page_is_singleton(page)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + + mi_page_set_abandoned_mapped(page); + const bool was_clear = mi_bitmap_set(arena_pages->pages_abandoned[bin], slice_index); + MI_UNUSED(was_clear); mi_assert_internal(was_clear); + mi_atomic_increment_relaxed(&heap->abandoned_count[bin]); + mi_theap_stat_increase(current_theap, pages_abandoned, 1); + } + else { + // page is full (or a singleton), or the page is OS/externally allocated + // leave as is; it will be reclaimed when an object is free'd in the page + // but for non-arena pages, add to the subproc list so these can be visited + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&heap->os_abandoned_pages_lock) { + // push in front + page->prev = NULL; + page->next = heap->os_abandoned_pages; + if (page->next != NULL) { page->next->prev = page; } + heap->os_abandoned_pages = page; + } + } + mi_theap_stat_increase(current_theap, pages_abandoned, 1); + } + mi_abandoned_page_unown(page, current_theap); } -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) -{ - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena, consider it un-abandoned now. - // but we need to still claim it atomically -- we use the thread_id for that. 
- size_t expected = 0; - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { - mi_atomic_decrement_relaxed(&abandoned_count); - return true; + +// this is called from `free.c:mi_free_try_collect_mt` only. +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) { + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_is_abandoned_mapped(page)); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(!mi_page_all_free(page)); + mi_assert_internal(!mi_page_is_singleton(page)); + if (mi_page_is_full(page) || mi_page_is_abandoned_mapped(page) || page->memid.memkind != MI_MEM_ARENA) { + return false; + } + else { + // do not use _mi_heap_theap as we may call this during shutdown of threads and don't want to reinitialize the theap + mi_theap_t* const theap = _mi_page_associated_theap_peek(page); + if (theap == NULL) { + return false; } else { - return false; + mi_theap_stat_counter_increase(theap, pages_reabandon_full, 1); + mi_theap_stat_adjust_decrease(theap, pages_abandoned, 1); // adjust as we are not abandoning fresh + _mi_arenas_page_abandon(page, theap); + return true; } } - // arena segment: use the blocks_abandoned bitmap. 
- size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); +} + +// called from `mi_free` if trying to unabandon an abandoned page +void _mi_arenas_page_unabandon(mi_page_t* page, mi_theap_t* current_theapx) { + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(current_theapx==NULL || _mi_thread_id()==current_theapx->tld->thread_id); + + mi_heap_t* const heap = mi_page_heap(page); + if (mi_page_is_abandoned_mapped(page)) { + mi_assert_internal(page->memid.memkind==MI_MEM_ARENA); + // remove from the abandoned map + size_t bin = _mi_bin(mi_page_block_size(page)); + size_t slice_index; + size_t slice_count; + mi_arena_pages_t* arena_pages; + mi_arena_t* arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); MI_UNUSED(arena); + + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + + // this busy waits until a concurrent reader (from alloc_abandoned) is done + mi_bitmap_clear_once_set(arena_pages->pages_abandoned[bin], slice_index); + mi_page_clear_abandoned_mapped(page); + mi_atomic_decrement_relaxed(&heap->abandoned_count[bin]); + } + else { + // page is full (or a 
singleton), page is OS allocated + // if not an arena page, remove from the subproc os pages list + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&heap->os_abandoned_pages_lock) { + if (page->prev != NULL) { page->prev->next = page->next; } + if (page->next != NULL) { page->next->prev = page->prev; } + if (heap->os_abandoned_pages == page) { heap->os_abandoned_pages = page->next; } + page->next = NULL; + page->prev = NULL; + } + } + } + if (current_theapx!=NULL) { + mi_theap_stat_decrease(current_theapx, pages_abandoned, 1); + } + else { + mi_heap_stat_decrease(heap, pages_abandoned, 1); } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; } -// mark a specific segment as abandoned -// clears the thread_id. 
-void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_atomic_store_release(&segment->thread_id, 0); - mi_assert_internal(segment->used == segment->abandoned); - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena; count it as abandoned and return - mi_atomic_increment_relaxed(&abandoned_count); - return; - } - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_subproc_t* subproc, size_t tseq); + +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { + if (p==NULL) return; + if (size==0) return; + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+ mi_track_mem_undefined(p, size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + _mi_os_free(p, size, memid); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t slice_count; + size_t slice_index; + mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); + mi_assert_internal((size%MI_ARENA_SLICE_SIZE)==0); + mi_assert_internal((slice_count*MI_ARENA_SLICE_SIZE)==size); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p); + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(slice_index < arena->slice_count); + mi_assert_internal(slice_index >= mi_arena_info_slices(arena)); + if (slice_index < mi_arena_info_slices(arena) || slice_index > arena->slice_count) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (!arena->memid.is_pinned /* && !arena->memid.initially_committed */) { // todo: allow decommit even if initially committed? 
+ // (delay) purge the page + mi_arena_schedule_purge(arena, slice_index, slice_count); + } + + // and make it available to others again + bool all_inuse = mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); + return; + }; + } + else if (memid.memkind == MI_MEM_META) { + _mi_meta_free(p, size, memid); + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(mi_memid_needs_no_free(memid)); + } + + // try to purge expired decommits + // mi_arenas_try_purge(false, false, NULL); } -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); - current->count = 0; - current->bitmap_idx = 0; +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld) { + mi_arenas_try_purge(force_purge, visit_all, tld->subproc, tld->thread_seq); } -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) -{ - const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); - if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; - - int count = previous->count; - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; - // visit arena's (from previous) - for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { - mi_arena_id_t arena_idx = previous->start + count; - if 
(arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + +// Is a pointer contained in the given arena area? +static bool mi_arena_strictly_contains(mi_arena_t* arena, const void* p) { + return (arena != NULL && + mi_arena_start(arena) <= (const uint8_t*)p && + mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p); +} + +// Is a pointer inside any of our arenas? +static bool mi_arenas_contain_ex(const void* p, mi_arena_t* parent) { + mi_subproc_t* subproc = _mi_subproc(); + const size_t max_arena = mi_arenas_get_count(subproc); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL) { - // visit the abandoned fields (starting at previous_idx) - for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // visit each set bit in the field (todo: maybe use `ctz` here?) 
- for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - // try to reclaim it atomically - if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_atomic_decrement_relaxed(&abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return segment; - } - } - } + if (parent==NULL || arena==parent || arena->parent==parent) { + if (mi_arena_strictly_contains(arena, p)) { + return true; } } } } - // no more found - previous->bitmap_idx = 0; - previous->count = 0; - return NULL; + return false; +} + +// Is a pointer inside any of our arenas? +bool _mi_arenas_contain(const void* p) { + return mi_arenas_contain_ex(p, NULL); +} + +// Is a pointer contained in the given arena area? +bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (arena==NULL) return false; + else if (mi_arena_strictly_contains(arena, p)) return true; + else return mi_arenas_contain_ex(p, arena); // maybe a subarena? +} + + +/* ----------------------------------------------------------- + Remove an arena. +----------------------------------------------------------- */ + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. 
+static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { + mi_assert_internal(subproc != NULL); + const size_t arena_count = mi_arenas_get_count(subproc); + for (size_t i = 0; i < arena_count; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); + if (arena != NULL) { + // mi_lock_done(&arena->abandoned_visit_lock); + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); + if (mi_memkind_is_os(arena->memid.memkind)) { + _mi_os_free_ex(mi_arena_start(arena), mi_arena_size(arena), true, arena->memid, subproc); // pass `subproc` to avoid accessing the theap pointer (in `_mi_subproc()`) + } + } + } + // try to lower the max arena. + size_t expected = arena_count; + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, (size_t)0); +} + + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arenas_unsafe_destroy_all(mi_subproc_t* subproc) { + mi_arenas_unsafe_destroy(subproc); + // mi_arenas_try_purge(true /* force purge */, true /* visit all*/, subproc, 0 /* thread seq */); // purge non-owned arenas } @@ -875,102 +1377,316 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arenas_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) +{ mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } + mi_assert_internal(arena->slice_count > 0); + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + // try to find a NULL entry + mi_arena_t* expected; + size_t count = mi_arenas_get_count(subproc); + for( size_t i = 0; i < count; i++) { + if (mi_arena_from_index(subproc,i) == NULL) { + arena->arena_idx = i; + expected = NULL; + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[i], &expected, arena)) { + // success + if (arena_id != NULL) { *arena_id = mi_arena_id_from_arena(arena); } + return true; + } + } + } - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; + // otherwise, try to allocate a fresh slot + while(countarena_count, &count, count+1)) { + arena->arena_idx = count; + expected = NULL; + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[count], &expected, arena)) { + mi_subproc_stat_counter_increase(arena->subproc, arena_count, 1); + if (arena_id != NULL) { *arena_id = mi_arena_id_from_arena(arena); } + return true; + } + } } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; + + // failed + arena->arena_idx = 0; + arena->subproc = NULL; + return false; +} + +static size_t mi_arena_pages_size(size_t slice_count, size_t* bitmap_base) { + if (slice_count == 0) slice_count = 
MI_BCHUNK_BITS; + mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_pages_t), MI_BCHUNK_SIZE); + const size_t bitmaps_count = 1 + MI_ARENA_BIN_COUNT; // pages, and abandoned + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL); + const size_t size = base_size + bitmaps_size; + if (bitmap_base != NULL) *bitmap_base = base_size; + return size; +} + +static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) { + if (slice_count == 0) slice_count = MI_BCHUNK_BITS; + mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); + const size_t bitmaps_count = 4 + MI_ARENA_BIN_COUNT; // commit, dirty, purge, pages, and abandoned + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL) + mi_bbitmap_size(slice_count, NULL); // + free + #if MI_PAGE_META_IS_SEPARATED + const size_t pages_size = slice_count * sizeof(mi_page_t); + #else + const size_t pages_size = 0; + #endif + const size_t size = base_size + bitmaps_size + pages_size; + + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(size, os_page_size) + _mi_os_secure_guard_page_size(); + const size_t info_slices = mi_slice_count_of_size(info_size); + + if (bitmap_base != NULL) *bitmap_base = base_size; + return info_slices; +} + +static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { + mi_bitmap_t* bitmap = (mi_bitmap_t*)(*base); + *base = (*base) + mi_bitmap_init(bitmap, slice_count, true /* already zero */); + return bitmap; +} + +static mi_bbitmap_t* mi_arena_bbitmap_init(size_t slice_count, uint8_t** base) { + mi_bbitmap_t* bbitmap = (mi_bbitmap_t*)(*base); + *base = (*base) + mi_bbitmap_init(bbitmap, slice_count, true /* already zero */); + return bbitmap; +} + +static mi_arena_pages_t* mi_arena_pages_alloc(mi_arena_t* arena) { + const size_t 
slice_count = arena->slice_count; + size_t bitmap_base = 0; + const size_t size = mi_arena_pages_size(slice_count, &bitmap_base); + mi_arena_pages_t* arena_pages = (mi_arena_pages_t*)mi_heap_zalloc_aligned(mi_heap_main(), size, MI_BCHUNK_SIZE); + if (arena_pages==NULL) return NULL; + uint8_t* base = (uint8_t*)arena_pages + bitmap_base; + mi_assert_internal(_mi_is_aligned(base, MI_BCHUNK_SIZE)); + arena_pages->pages = mi_arena_bitmap_init(slice_count, &base); + for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { + arena_pages->pages_abandoned[i] = mi_arena_bitmap_init(slice_count, &base); + } + return arena_pages; } -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static mi_arena_t* mi_arena_initialize(mi_subproc_t* subproc, void* start, + size_t slice_count, mi_arena_t* parent, size_t total_size, + int numa_node, bool exclusive, + mi_memid_t memid, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; + mi_assert_internal(_mi_is_aligned(start,MI_ARENA_SLICE_ALIGN)); + mi_assert_internal(mi_size_of_slices(slice_count)>=MI_ARENA_MIN_SIZE); - if (is_large) { - mi_assert_internal(memid.initially_committed && memid.is_pinned); + if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now + // note: this should never happen if called from `mi_manage_os_memory` (as that allocates sub-arenas when needed) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", mi_size_of_slices(slice_count)/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); + return NULL; } - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; - const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 
3 : 5); - const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? - if (arena == NULL) return false; + size_t bitmap_base; + const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); + if (slice_count < info_slices+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", mi_size_of_slices(slice_count)/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB); + return NULL; + } + else if (info_slices >= MI_ARENA_MAX_CHUNK_OBJ_SLICES) { + _mi_warning_message("cannot use OS memory since it is too large with respect to the maximum object size (size %zu MiB, meta-info slices %zu, maximum object slices are %zu)", mi_size_of_slices(slice_count)/MI_MiB, info_slices, MI_ARENA_MAX_CHUNK_OBJ_SLICES); + return NULL; + } + + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + if (!memid.initially_committed) { + size_t commit_size = mi_size_of_slices(info_slices); + // leave a guard OS page decommitted at the end? 
+ if (!memid.is_pinned) { commit_size -= _mi_os_secure_guard_page_size(); } + bool ok = false; + if (commit_fun != NULL) { + ok = (*commit_fun)(true /* commit */, arena, commit_size, NULL, commit_fun_arg); + } + else { + ok = _mi_os_commit(arena, commit_size, NULL); + } + if (!ok) { + _mi_warning_message("unable to commit meta-data for OS memory"); + return NULL; + } + } + else if (!memid.is_pinned) { + // if MI_SECURE, set a guard page at the end of the arena info + // todo: this does not respect the commit_fun as the memid is of external memory + _mi_os_secure_guard_page_set_before((uint8_t*)arena + mi_size_of_slices(info_slices), memid); + } + if (!memid.initially_zero) { + _mi_memzero(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size()); + } - // already zero'd due to zalloc - // _mi_memzero(arena, asize); - arena->id = _mi_arena_id_none(); + // init + arena->subproc = subproc; arena->memid = memid; - arena->exclusive = exclusive; - arena->meta_size = asize; - arena->meta_memid = meta_memid; - arena->block_count = bcount; - arena->field_count = fields; - arena->start = (uint8_t*)start; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; + arena->is_exclusive = exclusive; + arena->slice_count = slice_count; + arena->info_slices = info_slices; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->purge_expire = 0; - arena->search_idx = 0; - // consequetive bitmaps - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap - arena->blocks_purge = (arena->memid.is_pinned ? 
NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap - // initialize committed bitmap? - if (arena->blocks_committed != NULL && arena->memid.initially_committed) { - memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning + arena->commit_fun = commit_fun; + arena->commit_fun_arg = commit_fun_arg; + arena->parent = parent; + arena->total_size = total_size; + + // init bitmaps + uint8_t* base = mi_arena_start(arena) + bitmap_base; + arena->slices_free = mi_arena_bbitmap_init(slice_count, &base); + arena->slices_committed = mi_arena_bitmap_init(slice_count, &base); + arena->slices_dirty = mi_arena_bitmap_init(slice_count, &base); + arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); + arena->pages_main.pages = mi_arena_bitmap_init(slice_count, &base); + for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { + arena->pages_main.pages_abandoned[i] = mi_arena_bitmap_init(slice_count, &base); + } + #if MI_PAGE_META_IS_SEPARATED + arena->pages_meta = (mi_page_t*)base; + base += (slice_count * sizeof(mi_page_t)); + #else + arena->pages_meta = NULL; + #endif + mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); + + // reserve our meta info (and reserve slices outside the memory area) + mi_bbitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + if (memid.initially_committed) { + mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); + } + if (!memid.initially_zero) { + mi_bitmap_unsafe_setN(arena->slices_dirty, 0, arena->slice_count); + } + + if (!mi_arenas_add(subproc, arena, arena_id)) { return NULL; } + return arena; +} + +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive, + mi_memid_t memid, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + // checks + 
mi_assert(_mi_is_aligned(start, MI_ARENA_SLICE_SIZE)); + mi_assert(start!=NULL); + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + if (start==NULL) return false; + if (!_mi_is_aligned(start, MI_ARENA_SLICE_SIZE)) { + // we can align the start since the memid tracks the real base of the memory. + void* const aligned_start = _mi_align_up_ptr(start, MI_ARENA_SLICE_SIZE); + const size_t diff = (uint8_t*)aligned_start - (uint8_t*)start; + if (diff >= size || (size - diff) < MI_ARENA_SLICE_SIZE) { + _mi_warning_message("after alignment, the size of the arena becomes too small (memory at %p with size %zu)\n", start, size); + return false; + } + start = aligned_start; + size = size - diff; } - // and claim leftover blocks if needed (so we never allocate there) - ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; - mi_assert_internal(post >= 0); - if (post > 0) { - // don't use leftover bits at the end - mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); + // allocate enough arena's to span the full memory area + // the first arena is the owner, the rest are "sub-arena" (with `parent` pointing to the first one) + size_t total_slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BCHUNK_BITS); + size_t total_size = mi_size_of_slices(total_slice_count); + if (total_size < MI_ARENA_MIN_SIZE) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, MI_ARENA_MIN_SIZE/MI_KiB); + return false; } - return mi_arena_add(arena, arena_id, &_mi_stats_main); + mi_arena_t* parent = NULL; + do { + // counting down on the total_slice_count + size_t slice_count = total_slice_count; + if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now (with 64KiB slices) + slice_count = MI_BITMAP_MAX_BIT_COUNT; + } + + // initialize + mi_arena_t* arena = mi_arena_initialize( subproc, 
start, slice_count, parent, + (parent==NULL ? total_size : 0), numa_node, exclusive, + memid, commit_fun, commit_fun_arg, + (parent==NULL ? arena_id : NULL)); + if (arena==NULL) { + // failed to initialize due to failing commit or too many arena's + if (parent==NULL) { + return false; + } + else { + // partial success, but failed to use the full area.. + // todo: roll-back in this case? that requires a lock on the arena's array though + mi_assert(mi_size_of_slices(total_slice_count) <= parent->total_size); + parent->total_size -= mi_size_of_slices(total_slice_count); + return true; + } + } + + // success + if (parent==NULL) { + parent = arena; + memid.memkind = MI_MEM_NONE; + } + mi_assert(slice_count <= total_slice_count); + total_slice_count -= slice_count; + start = (uint8_t*)start + mi_size_of_slices(slice_count); + } + while (total_slice_count > 0); + + return true; +} + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.mem.os.base = start; + memid.mem.os.size = size; + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_pinned; + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, NULL, NULL, arena_id); } -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +bool mi_manage_memory(void* start, size_t size, bool is_committed, bool is_zero, bool is_pinned, int numa_node, bool exclusive, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) mi_attr_noexcept +{ mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.mem.os.base = start; + memid.mem.os.size = size; memid.initially_committed = is_committed; memid.initially_zero = is_zero; - 
memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); + memid.is_pinned = is_pinned; + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, commit_fun, commit_fun_arg, arena_id); } + // Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + if (!mi_manage_os_memory_ex2(subproc, start, size, -1 /* numa node */, exclusive, memid, NULL, NULL, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, NULL); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), memid.is_pinned ? 
" (in large os pages)" : ""); + // mi_debug_show_arenas(true, true, false); + return 0; } +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + return mi_reserve_os_memory_ex2(_mi_subproc(), size, commit, allow_large, exclusive, arena_id); +} // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { @@ -987,55 +1703,235 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe Debugging ----------------------------------------------------------- */ -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bcount = 0; - size_t inuse_count = 0; - for (size_t i = 0; i < field_count; i++) { - char buf[MI_BITMAP_FIELD_BITS + 1]; - uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { - if (bcount < block_count) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[bit] = (inuse ? 'x' : '.'); +// Return idx of the slice past the last used slice +static size_t mi_arena_used_slices(mi_arena_t* arena) { + size_t idx; + if (mi_bbitmap_bsr_inv(arena->slices_free, &idx)) { + return (idx + 1); + } + else { + return mi_arena_info_slices(arena); + } +} + +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf, size_t* k) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[*k] = (is_set ? 
'x' : '.'); + *k = *k + 1; + } + return bit_set_count; +} + +typedef enum mi_ansi_color_e { + MI_BLACK = 30, + MI_MAROON, + MI_DARKGREEN, + MI_ORANGE, + MI_NAVY, + MI_PURPLE, + MI_TEAL, + MI_GRAY, + MI_DARKGRAY = 90, + MI_RED, + MI_GREEN, + MI_YELLOW, + MI_BLUE, + MI_MAGENTA, + MI_CYAN, + MI_WHITE +} mi_ansi_color_t; + +static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { + *k += _mi_snprintf(buf + *k, 32, "\x1B[%dm", (int)color); +} + +static int mi_page_commit_usage(mi_page_t* page) { + // if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; + const size_t committed_size = mi_page_committed(page); + const size_t used_size = page->used * mi_page_block_size(page); + return (int)(used_size * 100 / committed_size); +} + +static size_t mi_debug_show_page_bfield(char* buf, size_t* k, mi_arena_t* arena, size_t slice_index, long* pbit_of_page, mi_ansi_color_t* pcolor_of_page ) { + size_t bit_set_count = 0; + long bit_of_page = *pbit_of_page; + mi_ansi_color_t color = *pcolor_of_page; + mi_ansi_color_t prev_color = MI_GRAY; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { + // bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + void* start = mi_arena_slice_start(arena, slice_index + bit); + mi_page_t* page = _mi_safe_ptr_page(start); + char c = ' '; + if (page!=NULL && start==mi_page_slice_start(page)) { + mi_assert_internal(bit_of_page <= 0); + bit_set_count++; + c = 'p'; + color = MI_GRAY; + if (mi_page_is_singleton(page)) { c = 's'; } + else if (mi_page_is_full(page)) { c = 'f'; } + if (!mi_page_is_abandoned(page)) { c = _mi_toupper(c); } + int commit_usage = mi_page_commit_usage(page); + if (commit_usage < 25) { color = MI_MAROON; } + else if (commit_usage < 50) { color = MI_ORANGE; } + else if (commit_usage < 75) { color = MI_TEAL; } + else color = MI_DARKGREEN; + bit_of_page = (long)page->memid.mem.arena.slice_count; + } + else { + c = '?'; + if (bit_of_page > 0) { c = '-'; } + else if 
(_mi_meta_is_meta_page(start)) { c = 'm'; color = MI_GRAY; } + else if (slice_index + bit < arena->info_slices) { c = 'i'; color = MI_GRAY; } + // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } + else if (mi_bbitmap_is_setN(arena->slices_free, slice_index+bit,1)) { + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; color = MI_ORANGE; } + else if (mi_bitmap_is_set(arena->slices_committed, slice_index + bit)) { c = '_'; color = MI_GRAY; } + else { c = '.'; color = MI_GRAY; } + } + if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } + } + if (color != prev_color) { + mi_debug_color(buf, k, color); + prev_color = color; + } + buf[*k] = c; *k += 1; + } + mi_debug_color(buf, k, MI_GRAY); + *pbit_of_page = bit_of_page; + *pcolor_of_page = color; + return bit_set_count; +} + +static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, + size_t slice_count, size_t chunk_count, + mi_bchunk_t* chunks, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) +{ + _mi_raw_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3); + const size_t fields_per_line = (narrow ? 
2 : 4); + const size_t used_slice_count = mi_arena_used_slices(arena); + size_t bit_count = 0; + size_t bit_set_count = 0; + long bit_of_page = 0; + mi_ansi_color_t color_of_page = MI_GRAY; + for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { + char buf[5*MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + if (bit_count > used_slice_count && i+2 < chunk_count) { + const size_t diff = chunk_count - 1 - i; + bit_count += diff*MI_BCHUNK_BITS; + _mi_raw_message(" |\n"); + i = chunk_count-1; + } + + size_t k = 0; + + if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } + else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } + else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + + char chunk_kind = ' '; + if (chunk_bins != NULL) { + switch (mi_bbitmap_debug_get_bin(chunk_bins,i)) { + case MI_CBIN_SMALL: chunk_kind = 'S'; break; + case MI_CBIN_MEDIUM: chunk_kind = 'M'; break; + case MI_CBIN_LARGE: chunk_kind = 'L'; break; + case MI_CBIN_HUGE: chunk_kind = 'H'; break; + case MI_CBIN_OTHER: chunk_kind = 'X'; break; + default: chunk_kind = ' '; break; // suppress warning + // case MI_CBIN_NONE: chunk_kind = 'N'; break; + } + } + buf[k++] = chunk_kind; + buf[k++] = ' '; + + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + if (j > 0 && (j % fields_per_line) == 0) { + // buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7; + _mi_raw_message(" %s\n\x1B[37m", buf); + _mi_memzero(buf, sizeof(buf)); + _mi_memset(buf, ' ', 5); k = 5; + } + if (bit_count < slice_count) { + mi_bfield_t bfield = 0; + if (chunks!=NULL) { + bfield = chunks[i].bfields[j]; + } + if (invert) bfield = ~bfield; + size_t xcount = (chunks==NULL ? 
mi_debug_show_page_bfield(buf, &k, arena, bit_count, &bit_of_page, &color_of_page) + : mi_debug_show_bfield(bfield, buf, &k)); + if (invert) xcount = MI_BFIELD_BITS - xcount; + bit_set_count += xcount; + buf[k++] = ' '; } else { - buf[bit] = ' '; + _mi_memset(buf + k, 'o', MI_BFIELD_BITS); + k += MI_BFIELD_BITS; } + bit_count += MI_BFIELD_BITS; } - buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); + _mi_raw_message(" %s\n\x1B[37m", buf); } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); - return inuse_count; + _mi_raw_message("\x1B[0m total pages: %zu\n", bit_set_count); + return bit_set_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; +//static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, +// mi_bitmap_t* bitmap, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { +// return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); +//} + +static void mi_debug_show_arenas_ex(mi_heap_t* heap, bool show_pages, bool narrow) mi_attr_noexcept { + mi_subproc_t* subproc = heap->subproc; + size_t max_arenas = mi_arenas_get_count(subproc); + //size_t free_total = 0; + //size_t slice_total = 0; + //size_t abandoned_total = 0; + size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, 
(arena->memid.is_pinned ? ", pinned" : "")); - if (show_inuse) { - inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); - } - if (arena->blocks_committed != NULL) { - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); - } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + mi_assert(arena->subproc == subproc); + // slice_total += arena->slice_count; + _mi_raw_message("%sarena %zu at %p: %zu slices (%zu MiB)%s%s, subproc: %p, numa: %i\n", + (arena->parent==NULL ? "" : "(sub)"), i, arena, arena->slice_count, (size_t)(mi_size_of_slices(arena->slice_count)/MI_MiB), + (arena->memid.is_pinned ? ", pinned" : ""), (arena->is_exclusive ? ", exclusive" : ""), + arena->subproc, arena->numa_node); + //if (show_inuse) { + // free_total += mi_debug_show_bbitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + //} + //if (show_committed) { + // mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); + //} + // todo: abandoned slices + //if (show_purge) { + // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); + //} + if (show_pages) { + // mi_arena_pages_t* arena_pages = mi_heap_arena_pages(heap, arena); + // if (arena_pages != NULL) + { + const char* header1 = "chunks (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)"; + const char* header2 = (narrow ? 
"\n " : " "); + const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)"; + page_total += mi_debug_show_chunks(header1, header2, header3, arena->slice_count, + mi_bbitmap_chunk_count(arena->slices_free), NULL, + arena->slices_free->chunkmap_bins, false, arena, narrow); + } } } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); + // if (show_inuse) _mi_raw_message("total inuse slices : %zu\n", slice_total - free_total); + // if (show_abandoned) _mi_raw_message("total abandoned slices: %zu\n", abandoned_total); + if (show_pages) _mi_raw_message("total pages in arenas: %zu\n", page_total); +} + +void mi_debug_show_arenas(void) mi_attr_noexcept { + mi_debug_show_arenas_ex(mi_heap_main(), true /* show pages */, true /* narrow? */); +} + +void mi_arenas_print(void) mi_attr_noexcept { + mi_debug_show_arenas(); } @@ -1044,7 +1940,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) ----------------------------------------------------------- */ // reserve at a specific numa node int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; + if (arena_id != NULL) *arena_id = NULL; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); @@ -1058,8 +1954,8 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); + if 
(!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, numa_node, exclusive, memid, NULL, NULL, arena_id)) { + _mi_os_free(p, hsize, memid); return ENOMEM; } return 0; @@ -1074,17 +1970,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t if (pages == 0) return 0; // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; + int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) { numa_count = 1; } const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if ((size_t)numa_node < pages_mod) { node_pages++; } + int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; @@ -1106,3 +2002,506 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv return err; } + + + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the commit bitmap +// assumes we own the area (i.e. 
slices_free is claimed by us) +// returns if the memory is no longer committed (versus reset which keeps the commit) +static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + mi_assert_internal(!arena->memid.is_pinned); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + + const size_t size = mi_size_of_slices(slice_count); + void* const p = mi_arena_slice_start(arena, slice_index); + //const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + size_t already_committed; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); // pretend all committed.. (as we lack a clearN call that counts the already set bits..) + const bool all_committed = (already_committed == slice_count); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */, mi_size_of_slices(already_committed), arena->commit_fun, arena->commit_fun_arg); + + if (needs_recommit) { + // no longer committed + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we just counted in the purge to decommit all, but the some part was not committed so adjust that here + // mi_subproc_stat_decrease(arena->subproc, committed, mi_size_of_slices(slice_count - already_committed)); + } + else if (!all_committed) { + // we cannot assume any of these are committed any longer (even with reset since we did setN and may have marked uncommitted slices as committed) + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we adjust the commit count as parts will be re-committed + // mi_subproc_stat_decrease(arena->subproc, committed, mi_size_of_slices(already_committed)); + } + + return needs_recommit; +} + + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. 
+// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + const long delay = mi_arena_purge_delay(); + if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? + + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + if (delay == 0) { + // purge directly + mi_arena_purge(arena, slice_index, slice_count); + } + else { + // schedule purge + const mi_msecs_t expire = _mi_clock_now() + delay; + mi_msecs_t expire0 = 0; + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) { + // expiration was not yet set + // maybe set the global arenas expire as well (if it wasn't set already) + mi_assert_internal(expire0==0); + mi_atomic_casi64_strong_acq_rel(&arena->subproc->purge_expire, &expire0, expire); + } + else { + // already an expiration was set + } + mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); + } +} + +typedef struct mi_purge_visit_info_s { + mi_msecs_t now; + mi_msecs_t delay; + bool all_purged; + bool any_purged; +} mi_purge_visit_info_t; + +static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + mi_assert(slice_count < MI_BCHUNK_BITS); + if (mi_bbitmap_try_clearNC(arena->slices_free, slice_index, slice_count)) { + // purge + bool decommitted = mi_arena_purge(arena, slice_index, slice_count); MI_UNUSED(decommitted); + mi_assert_internal(!decommitted || mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); + // and reset the free range + mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); + return true; + } + else { + // was allocated again already + return false; + } +} + +static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg; + // try to purge: 
first claim the free blocks + if (mi_arena_try_purge_range(arena, slice_index, slice_count)) { + vinfo->any_purged = true; + vinfo->all_purged = true; + } + else if (slice_count > 1) + { + // failed to claim the full range, try per slice instead + for (size_t i = 0; i < slice_count; i++) { + const bool purged = mi_arena_try_purge_range(arena, slice_index + i, 1); + vinfo->any_purged = vinfo->any_purged || purged; + vinfo->all_purged = vinfo->all_purged && purged; + } + } + // don't clear the purge bits as that is done atomically be the _bitmap_forall_set_ranges + // mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); + return true; // continue +} + +// returns +// -1 = nothing was purged +// 0 = nothing was purged yet because have not yet reached the expire time +// 1 = some pages in the arena were purged +static int mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) +{ + // check pre-conditions + if (arena->memid.is_pinned) return -1; + + // expired yet? + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (!force) { + if (expire==0) return -1; + if (expire > now) return 0; + } + + // reset expire + mi_atomic_storei64_release(&arena->purge_expire, (mi_msecs_t)0); + mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); + + // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) + // this also clears those ranges atomically (so any newly freed blocks will get purged next + // time around) + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; + + // we purge by at least `minslices` to not fragment transparent huge pages for example + const size_t minslices = mi_slice_count_of_size(_mi_os_minimal_purge_size()); + _mi_bitmap_forall_setc_rangesn(arena->slices_purge, minslices, &mi_arena_try_purge_visitor, arena, &vinfo); + + return (vinfo.any_purged ? 
1 : -1); +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_subproc_t* subproc, size_t tseq) +{ + // try purge can be called often so try to only run when needed + const long delay = mi_arena_purge_delay(); + if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled + + // check if any arena needs purging? + const mi_msecs_t now = _mi_clock_now(); + const mi_msecs_t arenas_expire = mi_atomic_loadi64_acquire(&subproc->purge_expire); + if (!visit_all && !force && (arenas_expire == 0 || arenas_expire > now)) return; + + const size_t max_arena = mi_arenas_get_count(subproc); + if (max_arena == 0) return; + + // allow only one thread to purge at a time (todo: allow concurrent purging?) + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + // increase global expire: at most one purge per delay cycle + if (arenas_expire > now) { mi_atomic_storei64_release(&subproc->purge_expire, now + (delay/10)); } + const size_t arena_start = tseq % max_arena; + size_t max_purge_count = (visit_all ? 
max_arena : (max_arena/4)+1); + bool all_visited = true; + bool any_purged = false; + for (size_t _i = 0; _i < max_arena; _i++) { + size_t i = _i + arena_start; + if (i >= max_arena) { i -= max_arena; } + mi_arena_t* arena = mi_arena_from_index(subproc,i); + if (arena != NULL) { + const int purged = mi_arena_try_purge(arena, now, force); + if (purged >= 0) { // purged, or arena expire is not yet reached + any_purged = true; + if (purged >= 1) { // purged + if (max_purge_count <= 1) { + all_visited = false; + break; + } + max_purge_count--; + } + } + } + } + if (all_visited && !any_purged) { + mi_atomic_storei64_release(&subproc->purge_expire, (mi_msecs_t)0); + } + } +} + + +/* ----------------------------------------------------------- + Visit all pages and blocks in a heap +----------------------------------------------------------- */ + +typedef struct mi_heap_visit_info_s { + mi_heap_t* heap; + mi_block_visit_fun* visitor; + void* arg; + bool visit_blocks; +} mi_heap_visit_info_t; + +static bool mi_heap_visit_page(mi_page_t* page, mi_heap_visit_info_t* vinfo) { + mi_heap_area_t area; + _mi_heap_area_init(&area, page); + mi_assert_internal(vinfo->heap == mi_page_heap(page)); + if (!vinfo->visitor(vinfo->heap, &area, NULL, area.block_size, vinfo->arg)) { + return false; + } + if (vinfo->visit_blocks) { + return _mi_theap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg); + } + else { + return true; + } +} + +static bool mi_heap_visit_page_at(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(slice_count); + mi_heap_visit_info_t* vinfo = (mi_heap_visit_info_t*)arg; + mi_page_t* page = mi_arena_page_at_slice(arena, slice_index); + return mi_heap_visit_page(page, vinfo); +} + +bool _mi_heap_visit_blocks(mi_heap_t* heap, bool abandoned_only, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_assert(visitor!=NULL); + if (visitor==NULL) return false; + if (heap==NULL) { heap = mi_heap_main(); } + // visit all 
pages in a heap + // we don't have to claim because we assume we are the only thread running (with this heap). + // (but we could atomically claim as well by first doing abandoned_reclaim and afterwards reabandoning). + mi_heap_visit_info_t visit_info = { heap, visitor, arg, visit_blocks }; + bool ok = true; + mi_forall_arenas(heap, NULL, 0, arena) { + mi_arena_pages_t* arena_pages = mi_heap_arena_pages(heap, arena); + if (ok && arena_pages != NULL) { + if (abandoned_only) { + for (size_t bin = 0; ok && bin < MI_BIN_COUNT; bin++) { + // todo: if we had a single abandoned page map as well, this can be faster. + if (mi_atomic_load_relaxed(&heap->abandoned_count[bin]) > 0) { + ok = _mi_bitmap_forall_set(arena_pages->pages_abandoned[bin], &mi_heap_visit_page_at, arena, &visit_info); + } + } + } + else { + ok = _mi_bitmap_forall_set(arena_pages->pages, &mi_heap_visit_page_at, arena, &visit_info); + } + } + } + mi_forall_arenas_end(); + if (!ok) return false; + + // visit abandoned pages in OS allocated memory + // (technically we don't need the initial lock as we assume we are the only thread running in this subproc) + mi_page_t* page = NULL; + mi_lock(&heap->os_abandoned_pages_lock) { + page = heap->os_abandoned_pages; + } + while (ok && page != NULL) { + mi_page_t* next = page->next; // read upfront in case the visitor frees the page + ok = mi_heap_visit_page(page, &visit_info); + page = next; + } + + return ok; +} + +bool mi_heap_visit_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + return _mi_heap_visit_blocks(heap, false, visit_blocks, visitor, arg); +} + +bool mi_heap_visit_abandoned_blocks(mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + return _mi_heap_visit_blocks(heap, true, visit_blocks, visitor, arg); +} + + +typedef struct mi_heap_delete_visit_info_s { + mi_heap_t* heap_target; + mi_theap_t* theap_target; + mi_theap_t* theap; +} mi_heap_delete_visit_info_t; + +static bool 
mi_heap_delete_page(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + MI_UNUSED(block); MI_UNUSED(block_size); MI_UNUSED(heap); + mi_heap_delete_visit_info_t* info = (mi_heap_delete_visit_info_t*)arg; + mi_heap_t* heap_target = info->heap_target; + mi_theap_t* const theap = NULL; // info->theap; mi_assert_internal(_mi_theap_heap(theap) == heap); + mi_page_t* const page = (mi_page_t*)area->reserved1; + + mi_page_claim_ownership(page); // claim ownership + if (mi_page_is_abandoned(page)) { + _mi_arenas_page_unabandon(page,theap); + } + else { + page->next = page->prev = NULL; // yikes.. better not to try to access this from a thread later on.. + mi_page_set_theap(page,NULL); // set threadid to abandoned + } + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_owned(page)); + + if (page->used==0) { + // free the page + _mi_arenas_page_free(page, theap); + } + else if (heap_target==NULL) { + // destroy the page + page->used=0; // note: invariant `|local_free| + |free| == reserved - used` does not hold in this case + _mi_arenas_page_free(page, theap); + } + else { + // move the page to `heap_target` as an abandoned page + // first remove it from the current heap + const size_t sbin = _mi_page_stats_bin(page); + size_t slice_index; + size_t slice_count; + mi_arena_pages_t* arena_pages = NULL; + mi_arena_t* const arena = mi_page_arena_pages(page, &slice_index, &slice_count, &arena_pages); + mi_assert_internal(mi_bitmap_is_set(arena_pages->pages, slice_index)); + mi_bitmap_clear(arena_pages->pages, slice_index); + if (theap != NULL) { + mi_theap_stat_decrease(theap, page_bins[sbin], 1); + mi_theap_stat_decrease(theap, pages, 1); + } + else { + mi_heap_stat_decrease((mi_heap_t*)heap, page_bins[_mi_page_stats_bin(page)], 1); + mi_heap_stat_decrease((mi_heap_t*)heap, pages, 1); + } + mi_theap_t* theap_target = info->theap_target; + + // and then add it to the new target heap + mi_arena_pages_t* 
arena_pages_target = mi_heap_ensure_arena_pages(heap_target, arena); + if mi_unlikely(arena_pages_target==NULL) { + // if we cannot allocate this, we move it to the main heap instead (which does not require allocation) + heap_target = mi_heap_main(); + theap_target = mi_heap_theap(heap_target); + arena_pages_target = mi_heap_ensure_arena_pages(heap_target, arena); + mi_assert_internal(arena_pages_target!=NULL); + } + mi_assert_internal(mi_bitmap_is_clear(arena_pages_target->pages, slice_index)); + mi_bitmap_set(arena_pages_target->pages, slice_index); + page->heap = heap_target; + mi_theap_stat_increase(theap_target, page_bins[sbin], 1); + mi_theap_stat_increase(theap_target, pages, 1); + + // and abandon in the new heap + _mi_arenas_page_abandon(page,theap_target); + } + return true; +} + +static void mi_heap_delete_pages(mi_heap_t* heap, mi_heap_t* heap_target) { + mi_theap_t* const theap_target = (heap_target != NULL ? _mi_heap_theap(heap_target) : NULL); + // mi_theap_t* const theap = _mi_heap_theap(heap); + mi_heap_delete_visit_info_t info = { heap_target, theap_target, NULL }; + _mi_heap_visit_blocks(heap, false, false, &mi_heap_delete_page, &info); + #if MI_DEBUG>1 + // no more arena pages? + for (size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_arena_pages_t* const arena_pages = mi_atomic_load_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[i]); + if (arena_pages!=NULL) { + mi_assert_internal(mi_bitmap_is_all_clear(arena_pages->pages)); + } + } + // nor os abandoned pages? + mi_lock(&heap->os_abandoned_pages_lock) { + + mi_assert_internal(heap->os_abandoned_pages == NULL); + } + // nor arena abandoned pages? 
+ for (size_t i = 0; i < MI_BIN_COUNT; i++) { + mi_assert_internal(mi_atomic_load_relaxed(&heap->abandoned_count[i])==0); + } + #endif +} + +void _mi_heap_move_pages(mi_heap_t* heap_from, mi_heap_t* heap_to) { + if (_mi_is_heap_main(heap_from)) return; + if (heap_to==NULL) { heap_to = mi_heap_main(); } + mi_heap_delete_pages(heap_from, heap_to); +} + +void _mi_heap_destroy_pages(mi_heap_t* heap_from) { + if (_mi_is_heap_main(heap_from)) return; + mi_heap_delete_pages(heap_from, NULL); +} + +/* ----------------------------------------------------------- + Unloading and reloading an arena. +----------------------------------------------------------- */ +/* +static bool mi_arena_page_register(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(arg); MI_UNUSED(slice_count); + mi_assert_internal(slice_count == 1); + mi_page_t* page = mi_arena_page_at_slice(arena, slice_index); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); + if (!_mi_page_map_register(page)) return false; // break + mi_assert_internal(_mi_ptr_page(page)==page); + return true; +} + +mi_decl_nodiscard static bool mi_arena_pages_reregister(mi_arena_t* arena) { + return _mi_bitmap_forall_set(arena->pages, &mi_arena_page_register, arena, NULL); +} + +mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (arena==NULL) { + return false; + } + else if (!arena->is_exclusive) { + _mi_warning_message("cannot unload a non-exclusive arena (id %zu at %p)\n", arena_id, arena); + return false; + } + else if (arena->memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena); + return false; + } + + // find accessed size + const size_t asize = mi_size_of_slices(mi_arena_used_slices(arena)); + if (base != NULL) { *base = 
(void*)arena; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (accessed_size != NULL) { *accessed_size = asize; } + + // adjust abandoned page count + mi_subproc_t* const subproc = arena->subproc; + for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) { + const size_t count = mi_bitmap_popcount(arena->pages_abandoned[bin]); + if (count > 0) { mi_atomic_decrement_acq_rel(&subproc->abandoned_count[bin]); } + } + + // unregister the pages + _mi_page_map_unregister_range(arena, asize); + + // set arena entry to NULL + const size_t count = mi_arenas_get_count(subproc); + for(size_t i = 0; i < count; i++) { + if (mi_arena_from_index(subproc, i) == arena) { + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); + if (i + 1 == count) { // try adjust the count? + size_t expected = count; + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, count-1); + } + break; + } + } + return true; +} + +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_commit_fun_t* commit_fun, void* commit_fun_arg, mi_arena_id_t* arena_id) { + // assume the memory area is already containing the arena + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + if (start == NULL || size == 0) return false; + mi_arena_t* arena = (mi_arena_t*)start; + mi_memid_t memid = arena->memid; + if (memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only reload arena's from external memory (%p)\n", arena); + return false; + } + if (memid.mem.os.base != start) { + _mi_warning_message("the reloaded arena base address differs from the external memory (arena: %p, external: %p)\n", arena, start); + return false; + } + if (memid.mem.os.size != size) { + _mi_warning_message("the reloaded arena size differs from the external memory (arena size: %zu, external size: %zu)\n", arena->memid.mem.os.size, size); + return false; + } + if (!arena->is_exclusive) { + _mi_warning_message("the reloaded arena is not exclusive\n"); + return false; + } + + // 
re-initialize + arena->is_exclusive = true; + arena->commit_fun = commit_fun; + arena->commit_fun_arg = commit_fun_arg; + arena->subproc = _mi_subproc(); + if (!mi_arenas_add(arena->subproc, arena, arena_id)) { + return false; + } + if (!mi_arena_pages_reregister(arena)) { + // todo: clear arena entry in the subproc? + return false; + } + + // adjust abandoned page count + for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) { + const size_t count = mi_bitmap_popcount(arena->pages_abandoned[bin]); + if (count > 0) { mi_atomic_decrement_acq_rel(&arena->subproc->abandoned_count[bin]); } + } + + return true; +} + +*/ diff --git a/system/lib/mimalloc/src/bitmap.c b/system/lib/mimalloc/src/bitmap.c index 4b6be66bcd2c9..ef17b700a5401 100644 --- a/system/lib/mimalloc/src/bitmap.c +++ b/system/lib/mimalloc/src/bitmap.c @@ -1,436 +1,1930 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #include "mimalloc.h" #include "mimalloc/internal.h" +#include "mimalloc/bits.h" #include "bitmap.h" -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +#ifndef MI_OPT_SIMD +#define MI_OPT_SIMD 0 +#endif + +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ -// The bit mask for a given number of blocks at a specified bit index. -static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { - mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; - if (count == 0) return 0; - return ((((size_t)1 << count) - 1) << bitidx); +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); } +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ +static inline size_t mi_bfield_popcount(mi_bfield_t x) { + return mi_popcount(x); +} -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. 
-inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) -{ - mi_assert_internal(bitmap_idx != NULL); - mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - if (map==MI_BITMAP_FIELD_FULL) return false; // short cut - - // search for 0-bit sequence of length count - const size_t mask = mi_bitmap_mask_(count, 0); - const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; - -#ifdef MI_HAVE_FAST_BITSCAN - size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible -#else - size_t bitidx = 0; // otherwise start at 0 -#endif - size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - - // scan linearly for a free range of zero bits - while (bitidx <= bitidx_max) { - const size_t mapm = (map & m); - if (mapm == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? - const size_t newmap = (map | m); - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? - // no success, another thread claimed concurrently.. keep going (with updated `map`) - continue; +static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) { + return (x & (x-1)); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +// find the most significant bit that is set. +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
+static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { + return mi_bsr(x, idx); +} + + + +// find each set bit in a bit field `x` and clear it, until it becomes zero. +static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { + const bool found = mi_bfield_find_least_bit(*x, idx); + *x = mi_bfield_clear_least_bit(*x); + return found; +} + +static inline mi_bfield_t mi_bfield_zero(void) { + return 0; +} + +static inline mi_bfield_t mi_bfield_one(void) { + return 1; +} + +static inline mi_bfield_t mi_bfield_all_set(void) { + return ~((mi_bfield_t)0); +} + +// mask of `bit_count` bits set shifted to the left by `shiftl` +static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { + mi_assert_internal(bit_count > 0); + mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); + mi_assert_internal(shiftl < MI_BFIELD_BITS); + const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set()); + return (mask0 << shiftl); +} + + +// ------- mi_bfield_atomic_set --------------------------------------- +// the `_set` functions return also the count of bits that were already set (for commit statistics) +// the `_clear` functions return also whether the new bfield is all clear or not (for the chunk_map) + +// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 +static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = mi_bfield_mask(1, idx);; + const mi_bfield_t old = mi_atomic_or_acq_rel(b, mask); + return ((old&mask) == 0); +} + +// Clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. +// `all_clear` is set if the new bfield is zero. 
+static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = mi_bfield_mask(1, idx);; + mi_bfield_t old = mi_atomic_and_acq_rel(b, ~mask); + if (all_clear != NULL) { *all_clear = ((old&~mask)==0); } + return ((old&mask) == mask); +} + +// Clear a bit but only when/once it is set. This is used by concurrent free's while +// the page is abandoned and mapped. This can incure a busy wait :-( but it should +// happen almost never (and is accounted for in the stats) +static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = mi_bfield_mask(1, idx);; + mi_bfield_t old = mi_atomic_load_relaxed(b); + do { + if mi_unlikely((old&mask) == 0) { + old = mi_atomic_load_acquire(b); + if ((old&mask)==0) { + mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1); } - else { - // success, we claimed the bits! - *bitmap_idx = mi_bitmap_index_create(idx, bitidx); - return true; + while ((old&mask)==0) { // busy wait + mi_atomic_yield(); + old = mi_atomic_load_acquire(b); } } + } while (!mi_atomic_cas_weak_acq_rel(b,&old, (old&~mask))); + mi_assert_internal((old&mask)==mask); // we should only clear when it was set +} + +// Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. +// `already_set` contains the count of bits that were already set (used when committing ranges to account +// statistics correctly). 
+static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { + mi_assert_internal(mask != 0); + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) {}; // try to atomically set the mask bits until success + if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); } + return ((old&mask) == 0); +} + +// Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's +// `all_clear` is set to `true` if the new bfield became zero. +static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { + mi_assert_internal(mask != 0); + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) {}; // try to atomically clear the mask bits until success + if (all_clear != NULL) { *all_clear = ((old&~mask)==0); } + return ((old&mask) == mask); +} + +static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already_set) { + const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_all_set()); + if (already_set!=NULL) { *already_set = mi_bfield_popcount(old); } + return (old==0); +} + +// static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) { +// const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); +// if (all_clear!=NULL) { *all_clear = true; } +// return (~old==0); +// } + +// ------- mi_bfield_atomic_try_clear --------------------------------------- + + +// Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 +// and false otherwise (leaving the bit field as is). +// `all_clear` is set to `true` if the new bfield became zero. 
+static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { + mi_assert_internal(mask != 0); + // try to atomically clear the mask bits + do { + if ((expect & mask) != mask) { // are all bits still set? + if (all_clear != NULL) { *all_clear = (expect == 0); } + return false; + } + } while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask)); + if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } + return true; +} + +static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) { + mi_assert_internal(mask != 0); + const mi_bfield_t expect = mi_atomic_load_relaxed(b); + return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); +} + +// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 +// and `false` otherwise leaving the bfield `b` as-is. +// `all_clear` is set to true if the new bfield became zero (and false otherwise) +mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); + if (already_set != NULL) { *already_set = (was_clear ? 0 : 1); } + return was_clear; +} + +// Set `0 < n <= MI_BFIELD_BITS`, and return true of the mask bits transitioned from all 0's to 1's. +// `already_set` contains the count of bits that were already set (used when committing ranges to account +// statistics correctly). +// Can cross over two bfields. 
+static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n > 0 && n <= MI_BFIELD_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(n,idx), already_set); + } + else { + // spanning two fields + const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + size_t already_set1; + const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1); + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + size_t already_set2; + const bool all_set2 = mi_bfield_atomic_set_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &already_set2); + if (already_set != NULL) { *already_set = already_set1 + already_set2; } + return (all_set1 && all_set2); + } +} + +// Set a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0). 
+mi_decl_noinline static bool mi_bchunk_xsetNC(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool maybe_all_clear = true; + size_t total_already_set = 0; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); + const mi_bfield_t mask = mi_bfield_mask(m, idx); + size_t already_set = 0; + bool all_clear = false; + const bool transition = (set ? mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set) + : mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear)); + mi_assert_internal((transition && already_set == 0) || (!transition && already_set > 0)); + all_transition = all_transition && transition; + total_already_set += already_set; + maybe_all_clear = maybe_all_clear && all_clear; + // next field + field++; + idx = 0; + mi_assert_internal(m <= n); + n -= m; + } + if (palready_set!=NULL) { *palready_set = total_already_set; } + if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; } + return all_transition; +} + +static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) return mi_bchunk_set(chunk, cidx, already_set); + // if (n==8 && (cidx%8) == 0) return mi_bchunk_set8(chunk, cidx, already_set); + // if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set); + if (n<=MI_BFIELD_BITS) return mi_bchunk_setNX(chunk, cidx, n, already_set); + return mi_bchunk_xsetNC(MI_BIT_SET, chunk, cidx, n, already_set, NULL); +} + +// ------- mi_bchunk_clear --------------------------------------- + +static inline bool 
mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_clear(&chunk->bfields[i], idx, all_clear); +} + +static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear); + // if (n==8) return mi_bchunk_clear8(chunk, cidx, maybe_all_clear); + // if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear); + // TODO: implement mi_bchunk_xsetNX instead of setNX + return mi_bchunk_xsetNC(MI_BIT_CLEAR, chunk, cidx, n, NULL, maybe_all_clear); +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +// This can cross bfield's +mi_decl_noinline static size_t mi_bchunk_popcountNC(mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { + mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS); + size_t count = 0; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + const size_t mask = mi_bfield_mask(m, idx); + count += mi_bfield_atomic_popcount_mask(&chunk->bfields[field_idx], mask); + // next field + field_idx++; + idx = 0; + n -= m; + } + return count; +} + +// Count set bits a sequence of `n` bits. +static inline size_t mi_bchunk_popcountN(mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return 0; + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + if (n==1) { return (mi_bfield_atomic_is_set(&chunk->bfields[i], idx) ? 
1 : 0); } + if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_popcount_mask(&chunk->bfields[i], mi_bfield_mask(n, idx)); } + return mi_bchunk_popcountNC(chunk, i, idx, n); +} + + +// ------- mi_bchunk_is_xset --------------------------------------- + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +// This can cross bfield's +mi_decl_noinline static bool mi_bchunk_is_xsetNC(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { + mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS); + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + const size_t mask = mi_bfield_mask(m, idx); + if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field_idx], mask)) { + return false; + } + // next field + field_idx++; + idx = 0; + n -= m; + } + return true; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + if (n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); } + if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } + return mi_bchunk_is_xsetNC(set, chunk, i, idx, n); +} + + +// ------- mi_bchunk_try_clear --------------------------------------- + +// Clear `0 < n <= MI_BITFIELD_BITS`. Can cross over a bfield boundary. 
+static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n <= MI_BFIELD_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(n, idx), pmaybe_all_clear); + } + else { + // spanning two fields (todo: use double-word atomic ops?) + const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + bool field1_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &field1_is_clear)) return false; + // try the second field as well + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + bool field2_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &field2_is_clear)) { + // we failed to clear the second field, restore the first one + mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), NULL); + return false; + } + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = field1_is_clear && field2_is_clear; } + return true; + } +} + +// Clear a full aligned bfield. +// static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { +// mi_assert_internal(cidx < MI_BCHUNK_BITS); +// mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); +// const size_t i = cidx / MI_BFIELD_BITS; +// return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); +// } + +// Try to atomically clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 1 to 0, +// and false otherwise leaving all bit fields as is. +// Note: this is the complex one as we need to unwind partial atomic operations if we fail halfway.. 
+// `maybe_all_clear` is set to `true` if all the bfields involved become zero. +mi_decl_noinline static bool mi_bchunk_try_clearNC(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = true; } + if (n==0) return true; + + // first field + const size_t start_idx = cidx % MI_BFIELD_BITS; + const size_t start_field = cidx / MI_BFIELD_BITS; + size_t field = start_field; + size_t m = MI_BFIELD_BITS - start_idx; // m are the bits to clear in this field + if (m > n) { m = n; } + mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); + mi_assert_internal(start_field < MI_BCHUNK_FIELDS); + const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); + bool maybe_all_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &maybe_all_clear)) return false; + + // done? + mi_assert_internal(m <= n); + n -= m; + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + // mid fields? + while (n >= MI_BFIELD_BITS) { + field++; + mi_assert_internal(field < MI_BCHUNK_FIELDS); + bool field_is_clear; + if (!mi_bfield_atomic_try_clearX(&chunk->bfields[field], &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; + n -= MI_BFIELD_BITS; + } + + // last field? 
+ if (n > 0) { + mi_assert_internal(n < MI_BFIELD_BITS); + field++; + mi_assert_internal(field < MI_BCHUNK_FIELDS); + const mi_bfield_t mask_end = mi_bfield_mask(n, 0); + bool field_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; + } + + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } + return true; + +restore: + // `field` is the index of the field that failed to set atomically; we need to restore all previous fields + mi_assert_internal(field > start_field); + while( field > start_field) { + field--; + if (field == start_field) { + mi_bfield_atomic_set_mask(&chunk->bfields[field], mask_start, NULL); + } else { - // on to the next bit range -#ifdef MI_HAVE_FAST_BITSCAN - mi_assert_internal(mapm != 0); - const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); - mi_assert_internal(shift > 0 && shift <= count); -#else - const size_t shift = 1; + mi_bfield_atomic_setX(&chunk->bfields[field], NULL); // mid-field: set all bits again + } + } + return false; +} + + +static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + mi_assert_internal(n>0); + // if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear); + if (n<=MI_BFIELD_BITS) return mi_bchunk_try_clearNX(chunk, cidx, n, maybe_all_clear); + return mi_bchunk_try_clearNC(chunk, cidx, n, maybe_all_clear); +} + + +// ------- mi_bchunk_try_find_and_clear --------------------------------------- + +#if MI_OPT_SIMD && defined(__AVX2__) +mi_decl_maybe_unused static inline __m256i mi_mm256_zero(void) { + return _mm256_setzero_si256(); +} +mi_decl_maybe_unused static inline __m256i mi_mm256_ones(void) { + return _mm256_set1_epi64x(~0); +} +mi_decl_maybe_unused static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} 
+mi_decl_maybe_unused static inline bool mi_mm256_is_zero( __m256i vec) { + return _mm256_testz_si256(vec,vec); +} #endif - bitidx += shift; - m <<= shift; + +static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) { + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever + // as the compiler won't reload the registers vec1 and vec2 from memory again. + const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); + size_t idx; + if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + return true; + } + } + return false; +} + +// Find least 1-bit in a chunk and try to clear it atomically +// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// This is used to find free slices and abandoned pages and should be efficient. +// todo: try neon version +static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 
0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + const size_t chunk_idx = _tzcnt_u32(mask) / 8; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded + // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206) + #if defined(__GNUC__) + __asm __volatile ("" : : "g"(chunk) : "memory"); + #endif + } + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries + size_t chunk_idx = 0; + #if 0 + // one vector at a time + __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if (mi_mm256_is_zero(vec)) { + chunk_idx += 4; + vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); + } + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 
0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + chunk_idx += _tzcnt_u32(mask) / 8; + #else + // a cache line is 64b so we can just as well load all at the same time + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + const __m256i cmpv = mi_mm256_zero(); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0) + const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
+ chunk_idx = mi_ctz(mask) / 8; + #endif + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded + // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206) + #if defined(__GNUC__) + __asm __volatile ("" : : "g"(chunk) : "memory"); + #endif + } + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + for(int tries=0; tries<4; tries++) { // paranoia: at most 4 tries + // a cache line is 64b so we can just as well load all at the same time (?) + const uint64x2_t vzero1_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields)); // 2x64 bit is_zero + const uint64x2_t vzero1_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 2)); // 2x64 bit is_zero + const uint64x2_t vzero2_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 4)); // 2x64 bit is_zero + const uint64x2_t vzero2_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 6)); // 2x64 bit is_zero + const uint32x4_t vzero1 = vuzp1q_u32(vreinterpretq_u32_u64(vzero1_lo),vreinterpretq_u32_u64(vzero1_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero2 = vuzp1q_u32(vreinterpretq_u32_u64(vzero2_lo),vreinterpretq_u32_u64(vzero2_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero1x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero1), 24)); // shift-right 2x32bit elem by 24: lo 16 bits contain the 2 lo bytes + const uint32x4_t vzero2x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero2), 24)); + const uint16x8_t vzero12 = vreinterpretq_u16_u32(vuzp1q_u32(vzero1x,vzero2x)); // unzip even 32-bit elements into one vector + const uint8x8_t vzero = vmovn_u16(vzero12); // narrow the bottom 16-bits + const uint64_t mask = ~vget_lane_u64(vreinterpret_u64_u8(vzero), 0); // 1 byte for each bfield (0xFF => bfield has a 
bit set) + if (mask==0) return false; + mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. + const size_t chunk_idx = mi_ctz(mask) / 8; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded + // we add an explicit memory barrier as older gcc compilers do not reload the registers even with an atomic acquire (issue #1206) + #if defined(__GNUC__) + __asm __volatile ("" : : "g"(chunk) : "memory"); + #endif + } + #else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true; + } + #endif + return false; +} + +static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==1); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear(chunk, pidx); +} + +mi_decl_maybe_unused static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = + ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) { // unset the byte atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); + return true; } } - // no bits found return false; } -// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success. 
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { +// find least aligned byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// Used to find medium size pages in the free blocks. +// todo: try neon version +static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + while (true) { + // since a cache-line is 64b, load all at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); + const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 
0xFF : 0) + const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte + const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) + if (mask==0) return false; + const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk + const size_t chunk_idx = bidx / 8; + const size_t idx = (bidx % 8)*8; + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } + } + #else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; + } + return false; + #endif +} + +static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==8); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear8(chunk, pidx); +} + + +// find a sequence of `n` bits in a chunk with `0 < n <= MI_BFIELD_BITS` with all bits set, +// and try to clear them atomically. +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// will cross bfield boundaries. +mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; + const mi_bfield_t mask = mi_bfield_mask(n, 0); + // for all fields in the chunk + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b = b0; + size_t idx; + + // is there a range inside the field? 
+ while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field + + const size_t bmask = mask<>idx == mask); + if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); + return true; + } + else { + // if we failed to atomically commit, reload b and try again from the start + b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]); + } + } + else { + // advance by clearing the least run of ones, for example, with n>=4, idx=2: + // b = 1111 1101 1010 1100 + // .. + (1< 0) { + const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); + if (post + pre >= n) { + // it fits -- try to claim it atomically + const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); + if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); + return true; + } + } + } + } } return false; } -// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled -bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, - const size_t start_field_idx, const size_t count, - mi_bitmap_pred_fun_t pred_fun, void* pred_arg, - mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) idx = 0; // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) { +// find a sequence of `n` bits in a chunk with `n <= MI_BCHUNK_BITS` with all bits set, +// and try to 
clear them atomically. +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// This can cross bfield boundaries. +static mi_decl_noinline bool mi_bchunk_try_find_and_clearNC(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + + // we first scan ahead to see if there is a range of `n` set bits, and only then try to clear atomically + mi_assert_internal(n>0); + const size_t skip_count = (n-1)/MI_BFIELD_BITS; + size_t cidx; + for (size_t i = 0; i < MI_BCHUNK_FIELDS - skip_count; i++) + { + size_t m = n; // bits to go + + // first field + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t ones = mi_bfield_clz(~b); + + cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - ones); // start index + if (ones >= m) { + // we found enough bits already! + m = 0; + } + else if (ones > 0) { + // keep scanning further fields until we have enough bits + m -= ones; + size_t j = 1; // field count from i + while (i+j < MI_BCHUNK_FIELDS) { + mi_assert_internal(m > 0); + b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + ones = mi_bfield_ctz(~b); + if (ones >= m) { + // we found enough bits + m = 0; + break; + } + else if (ones == MI_BFIELD_BITS) { + // not enough yet, proceed to the next field + j++; + m -= MI_BFIELD_BITS; + } + else { + // the range was not enough, start from scratch + i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) + mi_assert_internal(m>0); + break; + } + } + } + + // did we find a range? 
+ if (m==0) { + if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } - // predicate returned false, unclaim and look further - _mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx); + // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented) } + // otherwise continue searching } return false; } -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == mask); - const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); - return ((prev & mask) == mask); -} - - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } - return ((prev & mask) == 0); -} - -// Returns `true` if all `count` bits were 1. 
`any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } - return ((field & mask) == mask); -} - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { - if ((expected & mask) != 0) return false; - } - while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); - mi_assert_internal((expected & mask) == 0); + + +// ------- mi_bchunk_clear_once_set --------------------------------------- + +static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); +} + + +// ------- mi_bitmap_all_are_clear --------------------------------------- + + +// are all bits in a bitmap chunk clear? 
+static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_zero(vec); + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields); + const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2); + const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4); + const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6); + const uint64x2_t v = vorrq_u64(vorrq_u64(v0,v1),vorrq_u64(v2,v3)); + return (vmaxvq_u32(vreinterpretq_u32_u64(v)) == 0); + #else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } return true; + #endif } +// are all bits in a bitmap chunk set? 
+static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) { +#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_ones(vec); +#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2))); +#elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields); + const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2); + const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4); + const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6); + const uint64x2_t v = vandq_u64(vandq_u64(v0,v1),vandq_u64(v2,v3)); + return (vminvq_u32(vreinterpretq_u32_u64(v)) == 0xFFFFFFFFUL); +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; +#endif +} + + +static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { + for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { + i--; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t idx; + if (mi_bsr(b, &idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + return true; + } + } + return false; +} + +static bool mi_bchunk_bsr_inv(mi_bchunk_t* chunk, size_t* pidx) { + for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { + i--; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t idx; + if (mi_bsr(~b, &idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + return true; + } + } + return false; +} -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); 
+static size_t mi_bchunk_popcount(mi_bchunk_t* chunk) { + size_t popcount = 0; + for (size_t i = 0; i < MI_BCHUNK_FIELDS; i++) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + popcount += mi_bfield_popcount(b); + } + return popcount; } -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; + +/* -------------------------------------------------------------------------------- + bitmap chunkmap +-------------------------------------------------------------------------------- */ + +static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); +} + +static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); + return false; + } + return true; } -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. 
This is used in arena allocation -//-------------------------------------------------------------------------- +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ + +size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} -// Try to atomically claim a sequence of `count` bits starting from the field -// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. -// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bitmap_size(bit_count, &chunk_count); + if (!already_zero) { + _mi_memzero_aligned(bitmap, size); + } + mi_atomic_store_release(&bitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; +} + + +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
+static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + + // start chunk and index + size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); + + // first update the chunkmap + mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); + + // first chunk + size_t m = MI_BCHUNK_BITS - cidx; + if (m > n) { m = n; } + mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BCHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); + chunk_idx += mid_chunks; + n -= (mid_chunks * MI_BCHUNK_BITS); + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BCHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL); + } +} + +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n); +} + + + + +// ------- mi_bitmap_xset --------------------------------------- + +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). 
+bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* palready_set) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bitmap_max_bits(bitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return false; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + bool were_allclear = true; + size_t already_set = 0; + while (n > 0) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n); + size_t _already_set = 0; + were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, &_already_set) && were_allclear; + already_set += _already_set; + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + if (palready_set != NULL) { *palready_set = already_set; } + return were_allclear; +} + +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 1's to 0's. +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bitmap_max_bits(bitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return false; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + bool were_allset = true; + while (n > 0) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n); + bool maybe_all_clear = false; + were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, m, &maybe_all_clear) && were_allset; + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + return were_allset; +} + +// Count bits set in a range of `n` bits. 
+size_t mi_bitmap_popcountN( mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bitmap_max_bits(bitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return 0; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + size_t popcount = 0; + while (n > 0) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n); + popcount += mi_bchunk_popcountN(&bitmap->chunks[chunk_idx], cidx, m); + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + return popcount; +} + + +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_setN(bitmap, idx, 1, NULL); +} + +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_clearN(bitmap, idx, 1); +} + + + +// ------- mi_bitmap_is_xset --------------------------------------- + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bitmap_max_bits(bitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return false; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + bool xset = true; + while (n > 0 && xset) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? 
MI_BCHUNK_BITS - cidx : n); + xset = mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m) && xset; + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + return xset; +} + +bool mi_bitmap_is_all_clear(mi_bitmap_t* bitmap) { + return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, 0, mi_bitmap_max_bits(bitmap)); +} + +/* -------------------------------------------------------------------------------- + Iterate through a bfield +-------------------------------------------------------------------------------- */ + +// Cycle iteration through a bitfield. This is used to space out threads +// so there is less chance of contention. When searching for a free page we +// like to first search only the accessed part (so we reuse better). This +// high point is called the `cycle`. +// +// We then iterate through the bitfield as: +// first: [start, cycle> +// then : [0, start> +// then : [cycle, MI_BFIELD_BITS> +// +// The start is determined usually as `tseq % cycle` to have each thread +// start at a different spot. +// - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?) +// - The `cycle_mask` is the part `[start, cycle>`. 
+#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ + mi_assert_internal(start <= cycle); \ + mi_assert_internal(start < MI_BFIELD_BITS); \ + mi_assert_internal(cycle <= MI_BFIELD_BITS); \ + const mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ + size_t _bcount##SUF = mi_bfield_popcount(bfield); \ + mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ + while(_bcount##SUF > 0) { \ + _bcount##SUF--;\ + if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ + /* size_t name_idx; */ \ + const bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ + _b##SUF = mi_bfield_clear_least_bit(_b##SUF); /* clear early so `continue` works */ \ + mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ + { \ + +#define mi_bfield_iterate_end(SUF) \ + } \ + } \ +} + + +#define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ + const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* or: 0 to always search from the start? */\ + mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) + +#define mi_bfield_cycle_iterate_end(SUF) \ + mi_bfield_iterate_end(SUF); \ +} + + +/* -------------------------------------------------------------------------------- + mi_bitmap_find + (used to find free pages) +-------------------------------------------------------------------------------- */ + +typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2); + +// Go through the bitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. 
+static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { - mi_assert_internal(bitmap_idx != NULL); - - // check initial trailing zeros - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - const size_t initial = mi_clz(map); // count of initial zeros starting at idx - mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); - if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) - if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - - // scan ahead - size_t found = initial; - size_t mask = 0; // mask bits for the final field - while(found < count) { - field++; - map = mi_atomic_load_relaxed(field); - const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? 
MI_BITMAP_FIELD_BITS : (count - found)); - mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); - mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; // some part is already claimed - found += mask_bits; - } - mi_assert_internal(field < &bitmap[bitmap_fields]); - - // we found a range of contiguous zeros up to the final field; mask contains mask in the final field - // now try to claim the range atomically - mi_bitmap_field_t* const final_field = field; - const size_t final_mask = mask; - mi_bitmap_field_t* const initial_field = &bitmap[idx]; - const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; - const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); - - // initial field - size_t newmap; - field = initial_field; - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | initial_mask); - if ((map & initial_mask) != 0) { goto rollback; }; - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t hi; + if (mi_bfield_find_highest_bit(cmap_entry, &hi)) { + size_t eidx = 0; + mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = i*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { + return true; + } + } + mi_bfield_cycle_iterate_end(Y); + } + } + return false; +} - // intermediate fields - while (++field < final_field) { - newmap = MI_BITMAP_FIELD_FULL; - map = 0; - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } + 
+/* -------------------------------------------------------------------------------- + Bitmap: try_find_and_claim -- used to allocate abandoned pages + note: the compiler will fully inline the indirect function call +-------------------------------------------------------------------------------- */ + +typedef struct mi_claim_fun_data_s { + mi_arena_t* arena; +} mi_claim_fun_data_t; + +static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) +{ + mi_assert_internal(n==1); MI_UNUSED(n); + mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1; + mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2; + size_t cidx; + if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); + bool keep_set = true; + if ((*claim_fun)(slice_index, claim_data->arena, &keep_set)) { + // success! + mi_assert_internal(!keep_set); + *pidx = slice_index; + return true; + } + else { + // failed to claim it, set abandoned mapping again (unless the page was freed) + if (keep_set) { + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL); + mi_assert_internal(wasclear); MI_UNUSED(wasclear); + } + } + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return false; +} - // final field - mi_assert_internal(field == final_field); - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | final_mask); - if ((map & final_mask) != 0) { goto rollback; } - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); +// Find a set bit in the bitmap and try to atomically clear it and claim it. +// (Used to find pages in the pages_abandoned bitmaps.) 
+mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, mi_arena_t* arena ) +{ + mi_claim_fun_data_t claim_data = { arena }; + return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); +} - // claimed! - mi_stat_counter_increase(stats->arena_crossover_count,1); - *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); + +bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = chunkmap_max; i > 0; ) { + i--; + mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + if (mi_bsr(cmap,&cmap_idx)) { + // highest chunk + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + size_t cidx; + if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { + *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + return true; + } + } + } + return false; +} + +// Return count of all set bits in a bitmap. +size_t mi_bitmap_popcount(mi_bitmap_t* bitmap) { + // for all chunkmap entries + size_t popcount = 0; + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // count bits in a chunk + popcount += mi_bchunk_popcount(&bitmap->chunks[chunk_idx]); + } + } + return popcount; +} + + + +// Clear a bit once it is set. 
+void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); +} + + +// Visit all set bits in a bitmap. +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? +bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for(size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bidx; + while (mi_bfield_foreach_bit(&b, &bidx)) { + const size_t idx = base_idx + bidx; + if (!visit(idx, 1, arena, arg)) return false; + } + } + } + } return true; +} -rollback: - // roll back intermediate fields - // (we just failed to claim `field` so decrement first) - while (--field > initial_field) { - newmap = 0; - map = MI_BITMAP_FIELD_FULL; - mi_assert_internal(mi_atomic_load_relaxed(field) == map); - mi_atomic_store_release(field, newmap); - } - if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) - map = mi_atomic_load_relaxed(field); - do { - mi_assert_internal((map & initial_mask) == initial_mask); - newmap 
= (map & ~initial_mask); - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); +// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. +// Also clear those ranges atomically. +// Used by purging to purge larger ranges when possible +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], (mi_bfield_t)0); +#if MI_DEBUG > 1 + const size_t bpopcount = mi_popcount(b); + size_t rngcount = 0; +#endif + size_t bidx; + while (mi_bfield_find_least_bit(b, &bidx)) { + size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx +#if MI_DEBUG > 1 + rngcount += rng; +#endif + const size_t idx = base_idx + bidx; + mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); + mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); + mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); + if (!visit(idx, rng, arena, arg)) return false; + // clear rng bits in b + b = b & ~mi_bfield_mask(rng, bidx); + } + mi_assert_internal(rngcount == bpopcount); + } + } } - mi_stat_counter_increase(stats->arena_rollback_count,1); - // retry? 
(we make a recursive call instead of goto to be able to use const declarations) - if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); + return true; +} + +// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible, +// but only in chunks of at least `rngslices` slices (that are also aligned at `rngslices`) +// and clear those ranges atomically. +// However, the `rngslices` are capped at `MI_BFIELD_BITS` at most. +// Used by purging to purge larger ranges when possible. With transparent huge pages we only +// want to purge whole huge pages (2 MiB) at a time which is what the `rngslices` parameter achieves. +bool _mi_bitmap_forall_setc_rangesn(mi_bitmap_t* bitmap, size_t rngslices, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) +{ + // use the generic routine for `rngslices<=1` (as that one finds longest ranges at a time) + if (rngslices<=1) { + return _mi_bitmap_forall_setc_ranges(bitmap, visit, arena, arg); } - else { - return false; + // mi_assert_internal(rngslices <= MI_BFIELD_BITS); + if (rngslices > MI_BFIELD_BITS) { rngslices = MI_BFIELD_BITS; } // cap at MI_BFIELD_BITS at most + + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_exchange_relaxed(&chunk->bfields[j], (mi_bfield_t)0); // atomic clear + mi_bfield_t 
skipped = 0; // but track which bits we skip so we can restore them + for(size_t shift = 0; rngslices + shift <= MI_BFIELD_BITS; shift += rngslices) { // per `rngslices` to keep alignment + const mi_bfield_t rngmask = mi_bfield_mask(rngslices, shift); + if ((b & rngmask) == rngmask) { + const size_t idx = base_idx + shift; + if (!visit(idx, rngslices, arena, arg)) { + // break early + if (skipped != 0) { + mi_atomic_or_relaxed(&chunk->bfields[j], skipped); + return false; + } + } + } + else { + skipped = skipped | (b & rngmask); + } + } + + if (skipped != 0) { + mi_atomic_or_relaxed(&chunk->bfields[j], skipped); + } + } + } } + return true; } -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - mi_assert_internal(count > 0); - if (count <= 2) { - // we don't bother with crossover fields for small counts - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); +/* -------------------------------------------------------------------------------- + binned bitmap's +-------------------------------------------------------------------------------- */ + + +size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) { + // mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return 
size; +} + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bbitmap_size(bit_count, &chunk_count); + if (!already_zero) { + _mi_memzero_aligned(bbitmap, size); } + mi_atomic_store_release(&bbitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; +} + +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n); +} - // visit the fields - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - // first try to claim inside a field - /* - if (count <= MI_BITMAP_FIELD_BITS) { - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { +bool mi_bbitmap_bsr_inv(mi_bbitmap_t* bbitmap, size_t* idx) { + const size_t chunkmap_max = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap), MI_BFIELD_BITS); + for (size_t i = chunkmap_max; i > 0; ) { + i--; + mi_bfield_t cmap = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[i]); + size_t cmap_idx; + if (mi_bsr(~cmap, &cmap_idx)) { + // highest chunk + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + size_t cidx; + if (mi_bchunk_bsr_inv(&bbitmap->chunks[chunk_idx], &cidx)) { + *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; return true; } } - */ - // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { - return true; + } + return false; +} + + +/* -------------------------------------------------------------------------------- + binned bitmap used to track free slices 
+-------------------------------------------------------------------------------- */ + +// Assign a specific size bin to a chunk +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_chunkbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) { + if (ibin == bin) { + const bool was_clear = mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL); + if (was_clear) { mi_os_stat_increase(chunk_bins[ibin],1); } + } + else { + const bool was_set = mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL); + if (was_set) { mi_os_stat_decrease(chunk_bins[ibin],1); } + } + } +} + +mi_chunkbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) { + for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) { + if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) { + return ibin; + } + } + return MI_CBIN_NONE; +} + +// Track the index of the highest chunk that is accessed. 
+static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + +// Set a bit in the chunkmap +static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (check_all_set) { + if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { + // all slices are free in this chunk: return back to the NONE bin + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, MI_CBIN_NONE); + } + } + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); +} + +static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. (note: mi_bchunk_clear must be acq-rel) + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); + return false; + } + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); + return true; +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_setN, try_clearN, and is_xsetN + (used to find free pages) +-------------------------------------------------------------------------------- */ + +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). 
+bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bbitmap_max_bits(bbitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return false; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + bool were_allclear = true; + while (n > 0) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n); + were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, m, NULL) && were_allclear; + mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set afterwards + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + return were_allclear; +} + +// ------- mi_bbitmap_try_clearNC --------------------------------------- + +// Try to clear `n` bits at `idx` where `n <= MI_BCHUNK_BITS`. +bool mi_bbitmap_try_clearNC(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear = false; + const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } + // note: we don't set the size class for an explicit try_clearN (only used by purging) + return cleared; +} + + + +// ------- mi_bbitmap_is_xset --------------------------------------- + +// Is a sequence of n bits already all set/cleared? 
+bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t maxbits = mi_bbitmap_max_bits(bbitmap); + mi_assert_internal(idx + n <= maxbits); + if (idx+n > maxbits) { // paranoia + if (idx >= maxbits) return false; + n = maxbits - idx; + } + + // iterate through the chunks + size_t chunk_idx = idx / MI_BCHUNK_BITS; + size_t cidx = idx % MI_BCHUNK_BITS; + bool xset = true; + while (n > 0 && xset) { + const size_t m = (cidx + n > MI_BCHUNK_BITS ? MI_BCHUNK_BITS - cidx : n); + xset = mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, m) && xset; + mi_assert_internal(m <= n); + n -= m; + cidx = 0; + chunk_idx++; + } + return xset; +} + + + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_find + (used to find free pages) +-------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +// Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. +// +// This is used for finding free blocks and it is important to be efficient (with 2-level bitscan) +// but also reduce fragmentation (through size bins). 
+static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + const mi_chunkbin_t bbin = mi_chunkbin_of(n); + // visit each cmap entry + size_t cmap_idx = 0; + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); + const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + if (cmap_entry == 0) { + continue; + } + + // get size bin masks + mi_bfield_t cmap_bins[MI_CBIN_COUNT] = { 0 }; + cmap_bins[MI_CBIN_NONE] = cmap_entry; + for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin < MI_CBIN_NONE; ibin = mi_chunkbin_inc(ibin)) { + const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]); + cmap_bins[ibin] = cmap_bin & cmap_entry; + cmap_bins[MI_CBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin + } + + // consider only chunks for a particular size bin at a time + // this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple + // iterations through all entries. 
+ mi_assert_internal(bbin < MI_CBIN_NONE); + for (mi_chunkbin_t ibin = MI_CBIN_SMALL; ibin <= MI_CBIN_NONE; + // skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation) + ibin = (ibin == bbin ? MI_CBIN_NONE : mi_chunkbin_inc(ibin))) + { + mi_assert_internal(ibin < MI_CBIN_COUNT); + const mi_bfield_t cmap_bin = cmap_bins[ibin]; + size_t eidx = 0; + mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y) + { + // assertion doesn't quite hold as the max_accessed may be out-of-date + // mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_CBIN_NONE); + + // get the chunk + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; + + size_t cidx; + if ((*on_find)(chunk, n, &cidx)) { + if (cidx==0 && ibin == MI_CBIN_NONE) { // only the first block determines the size bin + // this chunk is now reserved for the `bbin` size class + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); + } + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + else { + // todo: should _on_find_ return a boolean if there is a chance all are clear to avoid calling `try_clear?` + // we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
+ mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + } + } + mi_bfield_cycle_iterate_end(Y); } } + mi_bfield_cycle_iterate_end(X); return false; } -// Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { - MI_UNUSED(bitmap_fields); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { - *pre_mask = mi_bitmap_mask_(count, bitidx); - *mid_mask = 0; - *post_mask = 0; - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); - return 0; +/* -------------------------------------------------------------------------------- + mi_bbitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { +// return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +// } + +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +bool mi_bbitmap_try_find_and_clearNC(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + 
mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNC); +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_try_find_and_clear for huge objects spanning multiple chunks +-------------------------------------------------------------------------------- */ + +// Try to atomically clear `n` bits starting at `chunk_idx` where `n` can span over multiple chunks +static bool mi_bchunk_try_clearN_(mi_bbitmap_t* bbitmap, size_t chunk_idx, size_t n) { + mi_assert_internal((chunk_idx * MI_BCHUNK_BITS) + n <= mi_bbitmap_max_bits(bbitmap)); + + size_t m = n; // bits to go + size_t count = 0; // chunk count + while (m > 0) { + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count]; + if (!mi_bchunk_try_clearN(chunk, 0, (m > MI_BCHUNK_BITS ? MI_BCHUNK_BITS : m), NULL)) { + goto rollback; + } + m = (m <= MI_BCHUNK_BITS ? 0 : m - MI_BCHUNK_BITS); + count++; } - else { - const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; - mi_assert_internal(pre_bits < count); - *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); - count -= pre_bits; - const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); - *mid_mask = MI_BITMAP_FIELD_FULL; - count %= MI_BITMAP_FIELD_BITS; - *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); - return mid_count; - } -} - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_one = true; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part - if ((prev & pre_mask) != pre_mask) all_one = false; - while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part - if ((prev & mid_mask) != mid_mask) all_one = false; - } - if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part - if ((prev & post_mask) != post_mask) all_one = false; - } - return all_one; -} - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_zero = true; - bool any_zero = false; - _Atomic(size_t)*field = &bitmap[idx]; - size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) all_zero = false; - if ((prev & pre_mask) != pre_mask) any_zero = true; - while (mid_count-- > 0) { - prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) all_zero = false; - if ((prev & mid_mask) != mid_mask) any_zero = true; - } - if (post_mask!=0) { - prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) all_zero = false; - if ((prev & post_mask) != post_mask) any_zero = true; - } - if (pany_zero != NULL) { *pany_zero = any_zero; } - return all_zero; -} - - -// Returns `true` if all `count` bits were 1. -// `any_ones` is `true` if there was at least one bit set to one. 
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_ones = true; - bool any_ones = false; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_load_relaxed(field++); - if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) any_ones = true; - while (mid_count-- > 0) { - prev = mi_atomic_load_relaxed(field++); - if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) any_ones = true; - } - if (post_mask!=0) { - prev = mi_atomic_load_relaxed(field); - if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) { *pany_ones = any_ones; } - return all_ones; -} - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; + return true; + +rollback: + // we only need to reset chunks the we just fully cleared + while (count > 0) { + count--; + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx + count]; + mi_bchunk_setN(chunk, 0, MI_BCHUNK_BITS, NULL); + } + return false; +} + +// Go through the bbitmap to find a sequence of `n` bits and clear them atomically where `n > MI_ARENA_MAX_CHUNK_OBJ_SIZE` +// Since these are very large object allocations we always search from the start and only consider starting at the start 
+// of a chunk (for fragmentation and efficiency). +// Todo: for now we try to find full empty chunks to cover `n` but we can allow a partial chunk at the end +// Todo: This scans directly through the chunks -- we might want to consult the cmap as well? +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + MI_UNUSED(tseq); + mi_assert(n > 0); if (n==0) { return false; } + + const size_t chunk_max = mi_bbitmap_chunk_count(bbitmap); + const size_t chunk_req = _mi_divide_up(n, MI_BCHUNK_BITS); // minimal number of chunks needed + if (chunk_max < chunk_req) { return false; } + + // iterate through the chunks + size_t chunk_idx = 0; + while (chunk_idx <= chunk_max - chunk_req) + { + size_t count = 0; // chunk count + do { + mi_assert_internal(chunk_idx + count < chunk_max); + mi_bchunk_t* const chunk = &bbitmap->chunks[chunk_idx + count]; + if (!mi_bchunk_all_are_set_relaxed(chunk)) { + break; + } + else { + count++; + } + } + while (count < chunk_req); + + // did we find a suitable range? + if (count == chunk_req) { + // now try to claim it! 
+ if (mi_bchunk_try_clearN_(bbitmap, chunk_idx, n)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS); + for (size_t i = 0; i < count; i++) { + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx + i, MI_CBIN_HUGE); + } + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + } + + // keep searching but skip the scanned range + chunk_idx += count+1; + } + return false; } + + + + + diff --git a/system/lib/mimalloc/src/bitmap.h b/system/lib/mimalloc/src/bitmap.h index d8316b83f40f1..2ecc3141d30ed 100644 --- a/system/lib/mimalloc/src/bitmap.h +++ b/system/lib/mimalloc/src/bitmap.h @@ -1,115 +1,343 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #pragma once #ifndef MI_BITMAP_H #define MI_BITMAP_H -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +/* -------------------------------------------------------------------------------- + Atomic bitmaps with release/acquire guarantees: + + `mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`) + each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB). + We need 16K bits to represent a 1GiB arena. + + `mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit) + allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number + of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB). + These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions + to scan for bits (perhaps) more efficiently. + + We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized + ranges aligned to a bfield. + + Searching linearly through the chunks would be too slow (16K bits per GiB). + Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2). + + `mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set. + The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. + This is used to avoid scanning every chunk. (and thus strictly an optimization) + It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out + to have no bits set. 
It is also allowed to briefly have a clear bit even if the + chunk has bits set -- as long as we guarantee that the bit will be set later on; + (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). + + However, when we clear a bit in a chunk, and the chunk is indeed all clear, we + cannot safely clear the bit corresponding to the chunk in the chunkmap since it + may race with another thread setting a bit in the same chunk. Therefore, when + clearing, we first test if a chunk is clear, then clear the chunkmap bit, and + then test again to catch any set bits that we may have missed. + + Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes + not find a free page even though it's there (but we accept this as we avoid taking + full locks). (Another way to do this is to use an epoch but we like to avoid that complexity + for now). + + `mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512) + and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size). + The minimum is 1 chunk which is a 32 MiB arena. + + For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count + and pop-count (but we think it can be adapted work reasonably well on older hardware too) +--------------------------------------------------------------------------------------------- */ + +// A word-size bit field. +typedef size_t mi_bfield_t; + +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. 
+ +#define MI_BCHUNK_SIZE (MI_BCHUNK_BITS / 8) +#define MI_BCHUNK_FIELDS (MI_BCHUNK_BITS / MI_BFIELD_BITS) // 8 on both 64- and 32-bit + + +// some compiler (msvc in C mode) cannot have expressions in the alignment attribute +#if MI_BCHUNK_SIZE==64 +#define mi_decl_bchunk_align mi_decl_align(64) +#elif MI_BCHUNK_SIZE==32 +#define mi_decl_bchunk_align mi_decl_align(32) +#else +#define mi_decl_bchunk_align mi_decl_align(MI_BCHUNK_SIZE) +#endif + + +// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit) +typedef mi_decl_bchunk_align struct mi_bchunk_s { + _Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS]; +} mi_bchunk_t; + + +// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set. +// The chunkmap is itself a chunk. +typedef mi_bchunk_t mi_bchunkmap_t; + +#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS + +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1) +#if MI_SIZE_BITS > 32 +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map +#else +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) +#endif +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena +#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena + + +// An atomic bitmap +typedef mi_decl_bchunk_align struct mi_bitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT +} mi_bitmap_t; + + +static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { + return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count); +} + +static inline size_t mi_bitmap_max_bits(const 
mi_bitmap_t* bitmap) { + return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS); +} + + + +/* -------------------------------------------------------------------------------- + Atomic bitmap operations +-------------------------------------------------------------------------------- */ + +// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing) +typedef bool mi_xset_t; +#define MI_BIT_SET (true) +#define MI_BIT_CLEAR (false) + + +// Required size of a bitmap to represent `bit_count` bits. +size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); + +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap. +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); -#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) -#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set -// An atomic bitmap of `size_t` fields -typedef _Atomic(size_t) mi_bitmap_field_t; -typedef mi_bitmap_field_t* mi_bitmap_t; +// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1 +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx); -// A bitmap index is the index of the bit in a bitmap. -typedef size_t mi_bitmap_index_t; +// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0 +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx); -// Create a bit index. 
-static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's +// If `already_set` is not NULL, it is set to count of bits were already all set. +// (this is used for correct statistics if commiting over a partially committed area) +bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set); + +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); + + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Is the bitmap completely clear? +bool mi_bitmap_is_all_clear(mi_bitmap_t* bitmap); + +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) +static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); } -// Create a bit index. -static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) { - return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS); +// Is a sequence of n bits already clear? +static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } -// Get the field index from a bit index. 
-static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx / MI_BITMAP_FIELD_BITS); +static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_setN(bitmap, idx, 1); } -// Get the bit index in a bitmap field -static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx % MI_BITMAP_FIELD_BITS); +static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_clearN(bitmap, idx, 1); } -// Get the full bit index -static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx; +// Called once a bit is cleared to see if the memory slice can be claimed. +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, bool* keep_set); + +// Find a set bits in the bitmap, atomically clear it, and check if `claim` returns true. +// If not claimed, continue on (potentially setting the bit again depending on `keep_set`). +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, mi_arena_t* arena ); + + +// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. +// This is used to delay free-ing a page that it at the same time being considered to be +// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`). +void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); + + +// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit. +// Otherwise return `false` (and `*idx` is undefined). +// Used for unloading arena's +bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); + +// Return count of all set bits in a bitmap. 
+size_t mi_bitmap_popcount(mi_bitmap_t* bitmap); + + +typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2); + +// Visit all set bits in a bitmap (`slice_count == 1`) +bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + +// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) +// Ranges will never cross chunk boundaries though (and `slice_count <= MI_BCHUNK_BITS`) +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + +// Visit all set bits in a bitmap with at least `rngslices` at a time (and aligned to `rngslices`). +// This is used by purging to not break up transparent huge pages for example. +// Ranges will never cross chunk boundaries (and `slice_count <= MI_BCHUNK_BITS`). +bool _mi_bitmap_forall_setc_rangesn(mi_bitmap_t* bitmap, size_t rngslices, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + +// Count all set bits in given range in the bitmap. +size_t mi_bitmap_popcountN( mi_bitmap_t* bitmap, size_t idx, size_t n); + +/* ---------------------------------------------------------------------------- + Binned concurrent bitmap + Assigns a size class to each chunk such that small blocks don't cause too + much fragmentation since we keep chunks for larger blocks separate. 
+---------------------------------------------------------------------------- */ + +// mi_chunkbin_t is defined in mimalloc-stats.h + +static inline mi_chunkbin_t mi_chunkbin_inc(mi_chunkbin_t bbin) { + mi_assert_internal(bbin < MI_CBIN_COUNT); + return (mi_chunkbin_t)((int)bbin + 1); +} + +static inline mi_chunkbin_t mi_chunkbin_dec(mi_chunkbin_t bbin) { + mi_assert_internal(bbin > MI_CBIN_NONE); + return (mi_chunkbin_t)((int)bbin - 1); +} + +static inline mi_chunkbin_t mi_chunkbin_of(size_t slice_count) { + if (slice_count==1) return MI_CBIN_SMALL; + if (slice_count==8) return MI_CBIN_MEDIUM; + #if MI_ENABLE_LARGE_PAGES + if (slice_count==MI_BFIELD_BITS) return MI_CBIN_LARGE; + #endif + if (slice_count > MI_BCHUNK_BITS) return MI_CBIN_HUGE; + return MI_CBIN_OTHER; } -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ +// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particular size classes +typedef mi_decl_bchunk_align struct mi_bbitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set + #if (MI_BCHUNK_SIZE / MI_SIZE_SIZE) > 2 + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc by aligning manually + #endif + mi_bchunkmap_t chunkmap; + mi_bchunkmap_t chunkmap_bins[MI_CBIN_COUNT - 1]; // chunkmaps with bit set if the chunk is in that size class (excluding MI_CBIN_NONE) + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT +} mi_bbitmap_t; -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. 
-bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); +static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) { + return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count); +} -// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled -typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg); -bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx); +static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) { + return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS); +} -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +mi_chunkbin_t mi_bbitmap_debug_get_bin(const mi_bchunk_t* chunkmap_bins, size_t chunk_idx); -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count); -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); +// If a bit is clear in the bitmap, return `true` and set `idx` to the index of the highest bit that was clear. +// Otherwise return `false` (and `*idx` is undefined). +// Used for debug output. +bool mi_bbitmap_bsr_inv(mi_bbitmap_t* bbitmap, size_t* idx); -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap. +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero); +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); +// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); +// Is a sequence of n bits already all set/cleared? +bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) +static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n); +} -#endif +// Is a sequence of n bits already clear? +static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n); +} + + +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. +// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. 
+bool mi_bbitmap_try_clearNC(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Specialized versions for common bit sequence sizes +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits +// bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearNC(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BCHUNK_BITS + +// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
+mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages + if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages + // if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages + if (n==0) return false; + if (n<=MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx); + if (n<=MI_BCHUNK_BITS) return mi_bbitmap_try_find_and_clearNC(bbitmap, tseq, n, pidx); + return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx); +} + + +#endif // MI_BITMAP_H diff --git a/system/lib/mimalloc/src/free.c b/system/lib/mimalloc/src/free.c index b9cb634616958..72bb2823fa879 100644 --- a/system/lib/mimalloc/src/free.c +++ b/system/lib/mimalloc/src/free.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -9,14 +9,13 @@ terms of the MIT license. 
A copy of the license can be found in the file // add includes help an IDE #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" // _mi_prim_thread_id() #endif // forward declarations static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block, bool was_guarded); static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); @@ -24,268 +23,370 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // Free // ------------------------------------------------------ -// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); - // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack -static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool was_guarded, bool track_stats, bool check_full) { - // checks + MI_UNUSED(was_guarded); + // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); + if (!was_guarded) { mi_check_padding(page, block); } if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - if (!mi_page_is_huge(page)) { // huge page content may be already decommitted - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - } + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, 
block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block, was_guarded)); } // faster then mi_usable_size as we already know the page and that p is unaligned // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); page->local_free = block; if mi_unlikely(--page->used == 0) { - _mi_page_retire(page); + if (page->retire_expire==0) { // no need to re-retire retired pages (happens when we alloc/free one block repeatedly in an empty page) + _mi_page_retire(page); + } } else if mi_unlikely(check_full && mi_page_is_in_full(page)) { _mi_page_unfull(page); } } +// Forward declaration for multi-threaded collect +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept; + +// Free a block multi-threaded +static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block, bool was_guarded) mi_attr_noexcept +{ + MI_UNUSED(was_guarded); + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block, was_guarded)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); +#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + if (!was_guarded) { + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + } +#endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always use owned: try to claim it if the page is abandoned */); + } while 
(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough? + + // and atomically try to collect the page if it was abandoned + const bool is_owned_now = !mi_tf_is_owned(tf_old); + if (is_owned_now) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_free_try_collect_mt(page,block); + } +} + + // Adjust a block that was allocated aligned, to the actual start of the block in the page. -// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the -// `page_start` and `block_size` fields; however these are constant and the page won't be +// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the +// `page_start` and `block_size` fields; however these are constant and the page won't be // deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently. mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff = (uint8_t*)p - page->page_start; - size_t adjust; - if mi_likely(page->block_size_shift != 0) { - adjust = diff & (((size_t)1 << page->block_size_shift) - 1); + const size_t diff = (uint8_t*)p - mi_page_start(page); + const size_t block_size = mi_page_block_size(page); + const size_t adjust = (_mi_is_power_of_two(block_size) ? 
diff & (block_size - 1) : diff % block_size); + return (mi_block_t*)((uintptr_t)p - adjust); +} + +// forward declaration for a MI_GUARDED build +#if MI_GUARDED +static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p); // forward declaration +static inline bool mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { + if (mi_block_ptr_is_guarded(block, p)) { + mi_block_unguard(page, block, p); + return true; } else { - adjust = diff % mi_page_block_size(page); + return false; } +} +#else +static inline bool mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { + MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(p); + return false; +} +#endif - return (mi_block_t*)((uintptr_t)p - adjust); +static inline mi_block_t* mi_validate_block_from_ptr( const mi_page_t* page, void* p ) { + mi_assert(_mi_page_ptr_unalign(page,p) == (mi_block_t*)p); // should never be an interior pointer + #if MI_SECURE > 0 + // in secure mode we always unalign to guard against free-ing interior pointers + return _mi_page_ptr_unalign(page,p); + #else + MI_UNUSED(page); + return (mi_block_t*)p; + #endif } + // free a local pointer (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - MI_UNUSED(segment); - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept { + mi_assert_internal(p!=NULL && page != NULL); + mi_block_t* const block = (mi_page_has_interior_pointers(page) ? 
_mi_page_ptr_unalign(page, p) : mi_validate_block_from_ptr(page,p)); + const bool was_guarded = mi_block_check_unguard(page, block, p); + mi_free_block_local(page, block, was_guarded, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) - mi_free_block_mt(page, segment, block); +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { + mi_assert_internal(p!=NULL && page != NULL); + mi_block_t* const block = (mi_page_has_interior_pointers(page) ? _mi_page_ptr_unalign(page, p) : mi_validate_block_from_ptr(page,p)); + const bool was_guarded = mi_block_check_unguard(page, block, p); + mi_free_block_mt(page, block, was_guarded); } // generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(page,segment,p); - else mi_free_generic_mt(page,segment,p); +void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(page,p); + else mi_free_generic_mt(page,p); } -// Get the segment data belonging to a pointer -// This is just a single `and` in release mode but does further checks in debug mode -// (and secure mode) to see if this was a valid pointer. -static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) -{ - MI_UNUSED(msg); -#if (MI_DEBUG>0) - if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { +// Get the page belonging to a pointer +// Does further checks in debug mode to see if this was a valid pointer. 
+static inline mi_page_t* mi_validate_ptr_page(const void* p, const char* msg) +{ + MI_UNUSED_RELEASE(msg); + #if MI_DEBUG + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } -#endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - if mi_unlikely(segment==NULL) return segment; - -#if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) - if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) + mi_page_t* page = _mi_safe_ptr_page(p); + if (p != NULL && page == NULL) { + _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); + } + return page; #else - { + return _mi_ptr_page(p); #endif - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } - } - } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } -#endif - - return segment; } // Free a block // Fast path written carefully to prevent register spilling on the stack -void mi_free(void* p) mi_attr_noexcept +static mi_decl_forceinline void mi_free_ex(void* p, size_t* usable, mi_page_t* page) { - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - if mi_unlikely(segment==NULL) return; - - const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); - - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - // thread-local, aligned, and not a full page - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); - } - else { - // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(page, segment, p); - } + if mi_unlikely(page==NULL) return; // page will be NULL if p==NULL + mi_assert_internal(p!=NULL && page!=NULL); + if (usable!=NULL) { *usable = mi_page_usable_block_size(page); } + + const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); + if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0` + // thread-local, aligned, and not a full page + mi_block_t* const block = mi_validate_block_from_ptr(page,p); + mi_free_block_local(page, block, false /* was guarded */, true /* track stats */, false /* no need to check if the page is full */); + } + else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) != 0` + // page is local, but is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(page, p); + } + // free-ing in a page owned by a theap in another thread, or an abandoned page (not belonging to a theap) + else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid != mi_page_thread_id(page) && mi_page_flags(page) == 0` + // blocks are aligned (and not a full page); push on the thread_free list + mi_block_t* const block = mi_validate_block_from_ptr(page,p); + mi_free_block_mt(page,block,false /* was_guarded */); } else { - // not thread-local; use generic path - mi_free_generic_mt(page, segment, p); + // page is full or contains (inner) aligned blocks; use generic multi-thread path + mi_free_generic_mt(page, p); } } -// return true if 
successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - mi_assert_internal(block!=NULL); - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); - - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! - // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } +void mi_free(void* p) mi_attr_noexcept { + mi_page_t* const page = mi_validate_ptr_page(p,"mi_free"); + mi_free_ex(p, NULL, page); +} - // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count - _mi_page_free_collect(page, false); +void mi_ufree(void* p, size_t* usable) mi_attr_noexcept { + mi_page_t* const page = mi_validate_ptr_page(p,"mi_ufree"); + mi_free_ex(p, usable, page); +} - // and free the block (possibly freeing the page as well since `used` is updated) - mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); - return true; +void mi_free_small(void* p) mi_attr_noexcept { + // We can only call `mi_free_small` for pointers allocated with `mi_(heap_)malloc_small`. + // If we keep page info in front of the page area for small objects, we can find the info + // just by aligning down the pointer instead of looking it up in the page map. 
+ #if MI_PAGE_META_ALIGNED_FREE_SMALL + #if MI_GUARDED + #warning "MI_PAGE_META_ALIGNED_FREE_SMALL ignored as MI_GUARDED is defined" + mi_free(p); + #elif MI_ARENA_SLICE_ALIGN < MI_SMALL_PAGE_SIZE + #warning "MI_PAGE_META_ALIGNED_FREE_SMALL ignored as the MI_ARENA_SLICE_ALIGN is less than the small page size" + mi_free(p); + #else + mi_page_t* const page = (mi_page_t*)_mi_align_down_ptr(p,MI_SMALL_PAGE_SIZE); + mi_assert(page == mi_validate_ptr_page(p,"mi_free_small")); + mi_assert((void*)page == _mi_align_down_ptr(page->page_start,MI_SMALL_PAGE_SIZE)); + mi_assert(page->block_size <= MI_SMALL_SIZE_MAX); // note: not `MI_SMALL_MAX_OBJ_SIZE` as we need to match `mi_(heap_)malloc_small` + mi_free_ex(p, NULL, page); + #endif + #else + mi_free(p); + #endif } -// ------------------------------------------------------ -// Multi-threaded Free (`_mt`) -// ------------------------------------------------------ -// Push a block that is owned by another thread on its page-local thread free -// list or it's heap delayed free list. Such blocks are later collected by -// the owning thread in `_mi_free_delayed_block`. -static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +// -------------------------------------------------------------------------------------------- +// `mi_free_try_collect_mt`: Potentially collect a page in a free in an abandoned page. +// 1. if the page becomes empty, free it +// 2. if it can be reclaimed, reclaim it in our theap +// 3. 
if it went to < 7/8th used, re-abandon to be mapped (so it can be found by theaps looking for free pages) +// -------------------------------------------------------------------------------------------- + +// Helper for mi_free_try_collect_mt: free if the page has no more used blocks (this is updated by `_mi_page_free_collect(_partly)`) +static bool mi_abandoned_page_try_free(mi_page_t* page) { - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } + if (!mi_page_all_free(page)) return false; + // first remove it from the abandoned pages in the arena (if mapped, this might wait for any readers to finish) + _mi_arenas_page_unabandon(page,NULL); + _mi_arenas_page_free(page,NULL); // we can now free the page directly + return true; +} - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); +// Helper for mi_free_try_collect_mt: try if we can reabandon a previously abandoned mostly full page to be mapped +static bool mi_abandoned_page_try_reabandon_to_mapped(mi_page_t* page) +{ + // if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations + // We only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (mi_page_is_mostly_used(page)) return false; // not too full + if (page->memid.memkind != MI_MEM_ARENA || mi_page_is_abandoned_mapped(page)) return false; // and not already mapped (or unmappable) + + mi_assert(!mi_page_is_full(page)); + return _mi_arenas_page_try_reabandon_to_mapped(page); +} + +// Release ownership of a page. This may free or reabandond the page if other blocks are concurrently +// freed in the meantime. Returns `true` if the page was freed. +// By passing the captured `expected_thread_free`, we can often avoid calling `mi_page_free_collect`. 
+static void mi_abandoned_page_unown_from_free(mi_page_t* page, mi_block_t* expected_thread_free) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_all_free(page)); + // try to cas atomically the original free list (`mt_free`) back with the ownership cleared. + mi_thread_free_t tf_expect = mi_tf_create(expected_thread_free, true); + mi_thread_free_t tf_new = mi_tf_create(expected_thread_free, false); + while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) { + mi_assert_internal(mi_tf_is_owned(tf_expect)); + // while the xthread_free list is not empty.. + while (mi_tf_block(tf_expect) != NULL) { + // if there were concurrent updates to the thread-free list, we retry to free or reabandon to mapped (if it became !mosty_used). + _mi_page_free_collect(page,false); // update used count + if (mi_abandoned_page_try_free(page)) return; + if (mi_abandoned_page_try_reabandon_to_mapped(page)) return; + // otherwise continue un-owning + tf_expect = mi_atomic_load_relaxed(&page->xthread_free); + } + // and try again to release ownership + mi_assert_internal(mi_tf_block(tf_expect)==NULL); + tf_new = mi_tf_create(NULL, false); } } -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) +static inline bool mi_page_queue_len_is_atmost( mi_theap_t* theap, size_t block_size, long atmost) { + if (atmost < 0) return false; + mi_page_queue_t* const pq = mi_page_queue(theap,block_size); + mi_assert_internal(pq!=NULL); + return (pq->count <= (size_t)atmost); +} + +// Helper for mi_free_try_collect_mt: try to reclaim the page for ourselves +static mi_decl_noinline bool mi_abandoned_page_try_reclaim(mi_page_t* page, long reclaim_on_free) mi_attr_noexcept { - // first see if the segment was abandoned and if we can reclaim it into our thread 
- if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) - { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } + // note: reclaiming can improve benchmarks like `larson` or `rbtree-ck` a lot even in the single-threaded case, + // since free-ing from an owned page avoids atomic operations. However, if we reclaim too eagerly in + // a multi-threaded scenario we may start to hold on to too much memory and reduce reuse among threads. + // If the current theap is where the page originally came from, we reclaim much more eagerly while + // 'cross-thread' reclaiming on free is by default off (and we only 'reclaim' these by finding the abandoned + // pages when we allocate a fresh page). + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_all_free(page)); + mi_assert_internal(page->block_size <= MI_SMALL_SIZE_MAX); + mi_assert_internal(reclaim_on_free >= 0); + + // dont reclaim if we just have terminated this thread and we should + // not reinitialize the theap for this thread. (can happen due to thread-local destructors for example -- issue #944) + if (!_mi_thread_is_initialized()) return false; + + // get our theap + mi_theap_t* const theap = _mi_page_associated_theap_peek(page); + if (theap==NULL || !theap->allow_page_reclaim) return false; + + // todo: cache `is_in_threadpool` and `exclusive_arena` directly in the theap for performance? + // set max_reclaim limit + long max_reclaim = 0; + if mi_likely(theap == page->theap) { // did this page originate from the current theap? 
(and thus allocated from this thread) + // originating theap + max_reclaim = _mi_option_get_fast(theap->tld->is_in_threadpool ? mi_option_page_cross_thread_max_reclaim : mi_option_page_max_reclaim); + } + else if (reclaim_on_free == 1 && // if cross-thread is allowed + !theap->tld->is_in_threadpool && // and we are not part of a threadpool + !mi_page_is_mostly_used(page) && // and the page is not too full + _mi_arena_memid_is_suitable(page->memid, _mi_theap_heap(theap)->exclusive_arena)) { // and it fits our memory + // across threads + max_reclaim = _mi_option_get_fast(mi_option_page_cross_thread_max_reclaim); } - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); + // are we within the reclaim limit? + if (max_reclaim >= 0 && !mi_page_queue_len_is_atmost(theap, page->block_size, max_reclaim)) { + return false; + } - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); + // reclaim the page into this theap + // first remove it from the abandoned pages in the arena -- this might wait for any readers to finish + _mi_arenas_page_unabandon(page, theap); + _mi_theap_page_reclaim(theap, page); + mi_theap_stat_counter_increase(theap, pages_reclaim_on_free, 1); + return true; +} - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - if (segment->kind == MI_SEGMENT_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by 
the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). - _mi_segment_huge_page_reset(segment, page, block); - #endif +// We freed a block in an abandoned page (that was not owned). Try to collect +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept +{ + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mt_free != NULL); + // we own the page now, and it is safe to collect the thread atomic free list + if (page->block_size <= MI_SMALL_SIZE_MAX) { + // use the `_partly` version to avoid atomic operations since we already have the `mt_free` pointing into the thread free list + // (after this the `used` count might be too high (as some blocks may have been concurrently added to the thread free list and are yet uncounted). + // however, if the page became completely free, the used count is guaranteed to be 0.) + mi_assert_internal(page->reserved>=16); // below this even one freed block goes from full to no longer mostly used. + _mi_page_free_collect_partly(page, mt_free); } else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif + // for larger blocks we use the regular collect + _mi_page_free_collect(page,false /* no force */); + mt_free = NULL; // expected page->xthread_free value after collection } + const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); + #if MI_DEBUG > 1 + if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } + if (mi_page_is_full(page)) { mi_assert(mi_page_is_mostly_used(page)); } + #endif - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); + // try to: 1. free it, 2. 
reclaim it, or 3. reabandon it to be mapped + if (mi_abandoned_page_try_free(page)) return; + if (page->block_size <= MI_SMALL_SIZE_MAX && reclaim_on_free >= 0) { // early test for better codegen + if (mi_abandoned_page_try_reclaim(page, reclaim_on_free)) return; + } + if (mi_abandoned_page_try_reabandon_to_mapped(page)) return; + + // otherwise unown the page again + mi_abandoned_page_unown_from_free(page, mt_free); } @@ -296,19 +397,19 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg // Bytes available in a block static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(page, p); - const size_t size = mi_page_usable_size_of(page, block); + const bool is_guarded = mi_block_ptr_is_guarded(block,p); + const size_t size = mi_page_usable_size_of(page, block, is_guarded); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); + const size_t aligned_size = (size - adjust); + return aligned_size; } -static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - if mi_unlikely(segment==NULL) return 0; - const mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(!mi_page_has_aligned(page)) { +static inline size_t _mi_usable_size(const void* p, const mi_page_t* page) mi_attr_noexcept { + if mi_unlikely(page==NULL) return 0; + if mi_likely(!mi_page_has_interior_pointers(page)) { const mi_block_t* block = (const mi_block_t*)p; - return mi_page_usable_size_of(page, block); + return mi_page_usable_size_of(page, block, false /* is guarded */); } else { // split out to separate routine for improved code generation @@ -317,7 +418,8 @@ static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noe } mi_decl_nodiscard size_t 
mi_usable_size(const void* p) mi_attr_noexcept { - return _mi_usable_size(p, "mi_usable_size"); + const mi_page_t* const page = mi_validate_ptr_page(p,"mi_usable_size"); + return _mi_usable_size(p,page); } @@ -327,7 +429,11 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { void mi_free_size(void* p, size_t size) mi_attr_noexcept { MI_UNUSED_RELEASE(size); - mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); + #if MI_DEBUG + const mi_page_t* const page = mi_validate_ptr_page(p,"mi_free_size"); + const size_t available = _mi_usable_size(p,page); + mi_assert(p == NULL || size <= available || available == 0 /* invalid pointer */ ); + #endif mi_free(p); } @@ -396,7 +502,7 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block // --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block +// Check for theap block overflow by setting up padding at the end of the block // --------------------------------------------------------------------------- #if MI_PADDING // && !MI_TRACK_ENABLED @@ -409,22 +515,28 @@ static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* bloc uintptr_t keys[2]; keys[0] = page->keys[0]; keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + bool ok = (mi_ptr_encode_canary(page,block,keys) == canary && *delta <= *bsize); mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); return ok; } // Return the exact usable size of a block. -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? 
bsize - delta : 0); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block, bool is_guarded) { + if (is_guarded) { + const size_t bsize = mi_page_block_size(page); + return (bsize - _mi_os_page_size()); + } + else { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? bsize - delta : 0); + } } // When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to +// list that is freed later by the owning theap. If the exact usable size is too small to // contain the pointer for the delayed list, then shrink the padding (by decreasing delta) // so it will later not trigger an overflow error in `mi_free_block`. void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { @@ -443,15 +555,13 @@ void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const si mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); } #else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block, bool is_guarded) { + MI_UNUSED(is_guarded); MI_UNUSED(block); return mi_page_usable_block_size(page); } void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); + MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(min_size); } #endif @@ -485,7 +595,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { size_t size; size_t wrong; if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + _mi_error_message(EFAULT, "buffer 
overflow in theap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); } } @@ -501,30 +611,48 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - #if (MI_STAT < 2) MI_UNUSED(block); - #endif - mi_heap_t* const heap = mi_heap_get_default(); + mi_theap_t* const theap = _mi_theap_default(); + if (!mi_theap_is_initialized(theap)) return; // (for now) skip statistics if free'd after thread_done was called (usually a thread cleanup call by the OS) + const size_t bsize = mi_page_usable_block_size(page); - #if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); - #endif - if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); + // #if (MI_STAT>1) + // const size_t usize = mi_page_usable_size_of(page, block); + // mi_theap_stat_decrease(theap, malloc_requested, usize); + // #endif + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { + mi_theap_stat_decrease(theap, malloc_normal, bsize); #if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); + mi_theap_stat_decrease(theap, malloc_bins[_mi_bin(bsize)], 1); #endif } - else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, large, bsize); - } else { - mi_heap_stat_decrease(heap, huge, bsize); + const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc + mi_theap_stat_decrease(theap, malloc_huge, bpsize); } } #else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif + + +// Remove guard page when building with MI_GUARDED +#if MI_GUARDED +static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) { + MI_UNUSED(p); + 
mi_assert_internal(mi_block_ptr_is_guarded(block, p)); + mi_assert_internal(mi_page_has_interior_pointers(page)); + mi_assert_internal((uint8_t*)p - (uint8_t*)block >= (ptrdiff_t)sizeof(mi_block_t)); + mi_assert_internal(block->next == MI_BLOCK_TAG_GUARDED); + + const size_t bsize = mi_page_block_size(page); + const size_t psize = _mi_os_page_size(); + mi_assert_internal(bsize > psize); + mi_assert_internal(!page->memid.is_pinned); + void* gpage = (uint8_t*)block + bsize - psize; + mi_assert_internal(_mi_is_aligned(gpage, psize)); + _mi_os_unprotect(gpage, psize); +} +#endif diff --git a/system/lib/mimalloc/src/heap.c b/system/lib/mimalloc/src/heap.c index e498fdb2093fb..35787301329ec 100644 --- a/system/lib/mimalloc/src/heap.c +++ b/system/lib/mimalloc/src/heap.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,647 +7,258 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" -#include "mimalloc/prim.h" // mi_prim_get_default_heap +#include "mimalloc/prim.h" // _mi_theap_default -#include // memset, memcpy - -#if defined(_MSC_VER) && (_MSC_VER < 1920) -#pragma warning(disable:4204) // non-constant aggregate initializer -#endif /* ----------------------------------------------------------- - Helpers + Heap's ----------------------------------------------------------- */ -// return `true` if ok, `false` to break -typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2); - -// Visit all pages in a heap; returns `false` if break was called. 
-static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2) -{ - if (heap==NULL || heap->page_count==0) return 0; - - // visit all pages - #if MI_DEBUG>1 - size_t total = heap->page_count; - size_t count = 0; - #endif - - for (size_t i = 0; i <= MI_BIN_FULL; i++) { - mi_page_queue_t* pq = &heap->pages[i]; - mi_page_t* page = pq->first; - while(page != NULL) { - mi_page_t* next = page->next; // save next in case the page gets removed from the queue - mi_assert_internal(mi_page_heap(page) == heap); - #if MI_DEBUG>1 - count++; - #endif - if (!fn(heap, pq, page, arg1, arg2)) return false; - page = next; // and continue - } - } - mi_assert_internal(count == total); - return true; +mi_theap_t* mi_heap_theap(mi_heap_t* heap) { + return _mi_heap_theap(heap); } - -#if MI_DEBUG>=2 -static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - MI_UNUSED(arg1); - MI_UNUSED(arg2); - MI_UNUSED(pq); - mi_assert_internal(mi_page_heap(page) == heap); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(segment->thread_id == heap->thread_id); - mi_assert_expensive(_mi_page_is_valid(page)); - return true; -} -#endif -#if MI_DEBUG>=3 -static bool mi_heap_is_valid(mi_heap_t* heap) { - mi_assert_internal(heap!=NULL); - mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL); - return true; +void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node) { + if (heap==NULL) { heap = mi_heap_main(); } + heap->numa_node = (numa_node < 0 ? -1 : numa_node % _mi_os_numa_node_count()); } -#endif - - - -/* ----------------------------------------------------------- - "Collect" pages by migrating `local_free` and `thread_free` - lists and freeing empty pages. 
This is done when a thread - stops (and in that case abandons pages if there are still - blocks alive) ------------------------------------------------------------ */ - -typedef enum mi_collect_e { - MI_NORMAL, - MI_FORCE, - MI_ABANDON -} mi_collect_t; - - -static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) { - MI_UNUSED(arg2); - MI_UNUSED(heap); - mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL)); - mi_collect_t collect = *((mi_collect_t*)arg_collect); - _mi_page_free_collect(page, collect >= MI_FORCE); - if (collect == MI_FORCE) { - // note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment. - mi_segment_t* segment = _mi_page_segment(page); - _mi_segment_collect(segment, true /* force? */, &heap->tld->segments); - } - if (mi_page_all_free(page)) { - // no more used blocks, free the page. - // note: this will free retired pages as well. 
- _mi_page_free(page, pq, collect >= MI_FORCE); - } - else if (collect == MI_ABANDON) { - // still used blocks but the thread is done; abandon the page - _mi_page_abandon(page, pq); - } - return true; // don't break +void mi_heap_stats_merge_to_subproc(mi_heap_t* heap) { + if (heap==NULL) { heap = mi_heap_main(); } + _mi_stats_merge_into(&heap->subproc->stats, &heap->stats); } -static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - MI_UNUSED(arg1); - MI_UNUSED(arg2); - MI_UNUSED(heap); - MI_UNUSED(pq); - _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); - return true; // don't break +void mi_heap_stats_merge_to_main(mi_heap_t* heap) { + if (heap==NULL) return; + _mi_stats_merge_into(&mi_heap_main()->stats, &heap->stats); } -static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) +static mi_decl_noinline mi_theap_t* mi_heap_init_theap(const mi_heap_t* const_heap) { - if (heap==NULL || !mi_heap_is_initialized(heap)) return; - - const bool force = (collect >= MI_FORCE); - _mi_deferred_free(heap, force); - - // python/cpython#112532: we may be called from a thread that is not the owner of the heap - const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); - - // note: never reclaim on collect but leave it to threads that need storage to reclaim - const bool force_main = - #ifdef NDEBUG - collect == MI_FORCE - #else - collect >= MI_FORCE - #endif - && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim; - - if (force_main) { - // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. - // if all memory is freed by now, all segments should be freed. 
- _mi_abandoned_reclaim_all(heap, &heap->tld->segments); - } + mi_heap_t* heap = (mi_heap_t*)const_heap; + mi_assert_internal(heap!=NULL); - // if abandoning, mark all pages to no longer add to delayed_free - if (collect == MI_ABANDON) { - mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); + if (_mi_is_heap_main(heap)) { + // this can be called if the (main) thread is not yet initialized (as no allocation happened) + // but `theap_main_init_get()` will call `mi_thread_init()` + mi_theap_t* const theap = _mi_theap_main_safe(); + mi_assert_internal(theap!=NULL && _mi_is_heap_main(_mi_theap_heap(theap))); + return theap; } - // free all current thread delayed blocks. - // (if abandoning, after this there are no more thread-delayed references into the pages.) - _mi_heap_delayed_free_all(heap); - - // collect retired pages - _mi_heap_collect_retired(heap, force); - - // collect all pages owned by this thread - mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - - // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list) - // note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment - _mi_abandoned_collect(heap, collect == MI_FORCE /* force? 
*/, &heap->tld->segments); - - // if forced, collect thread data cache on program-exit (or shared library unload) - if (force && is_main_thread && mi_heap_is_backing(heap)) { - _mi_thread_data_collect(); // collect thread data cache + // otherwise initialize the theap for this heap + // get the thread local + mi_assert_internal(heap->theap != 0); + if (heap->theap==0) { // paranoia + _mi_error_message(EFAULT, "no thread-local reserved for heap (%p)\n", heap); + return NULL; } - - // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); -} - -void _mi_heap_collect_abandon(mi_heap_t* heap) { - mi_heap_collect_ex(heap, MI_ABANDON); -} - -void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept { - mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL)); -} - -void mi_collect(bool force) mi_attr_noexcept { - mi_heap_collect(mi_prim_get_default_heap(), force); -} - - -/* ----------------------------------------------------------- - Heap new ------------------------------------------------------------ */ - -mi_heap_t* mi_heap_get_default(void) { - mi_thread_init(); - return mi_prim_get_default_heap(); -} - -static bool mi_heap_is_default(const mi_heap_t* heap) { - return (heap == mi_prim_get_default_heap()); + mi_theap_t* theap = (mi_theap_t*)_mi_thread_local_get(heap->theap); + + // create a fresh theap? 
+ if (theap==NULL) { + theap = _mi_theap_create(heap, _mi_theap_default_safe()->tld); + if (theap==NULL) { + _mi_error_message(EFAULT, "unable to allocate memory for a thread local heap\n"); + return NULL; + } + if (!_mi_thread_local_set(heap->theap, theap)) { + _mi_error_message(EFAULT, "unable to allocate memory for a thread local storage\n"); + return NULL; + } + } + return theap; } -mi_heap_t* mi_heap_get_backing(void) { - mi_heap_t* heap = mi_heap_get_default(); - mi_assert_internal(heap!=NULL); - mi_heap_t* bheap = heap->tld->heap_backing; - mi_assert_internal(bheap!=NULL); - mi_assert_internal(bheap->thread_id == _mi_thread_id()); - return bheap; -} - -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { - _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = tld; - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; - heap->no_reclaim = noreclaim; - heap->tag = tag; - if (heap == tld->heap_backing) { - _mi_random_init(&heap->random); +// get the theap for a heap without initializing (and return NULL in that case) +mi_theap_t* _mi_heap_theap_get_peek(const mi_heap_t* heap) { + if (heap==NULL || _mi_is_heap_main(heap)) { + return _mi_theap_main_safe(); } else { - _mi_random_split(&tld->heap_backing->random, &heap->random); + return (mi_theap_t*)_mi_thread_local_get(heap->theap); } - heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); - // push on the thread local heaps list - heap->next = heap->tld->heaps; - heap->tld->heaps = heap; -} - -mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { - mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
- if (heap == NULL) return NULL; - // don't reclaim abandoned pages or otherwise destroy is unsafe - _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); - return heap; -} - -mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { - return mi_heap_new_in_arena(_mi_arena_id_none()); } -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { - return _mi_arena_memid_is_suitable(memid, heap->arena_id); +// get (and possibly create) the theap belonging to a heap +mi_theap_t* _mi_heap_theap_get_or_init(const mi_heap_t* heap) +{ + mi_theap_t* theap = _mi_heap_theap_peek(heap); + if mi_unlikely(theap==NULL) { + theap = mi_heap_init_theap(heap); + if (theap==NULL) { return (mi_theap_t*)&_mi_theap_empty_wrong; } // this will return NULL from page.c:_mi_malloc_generic + } + _mi_theap_cached_set(theap); + return theap; } -uintptr_t _mi_heap_random_next(mi_heap_t* heap) { - return _mi_random_next(&heap->random); -} -// zero out the page queues -static void mi_heap_reset_pages(mi_heap_t* heap) { - mi_assert_internal(heap != NULL); - mi_assert_internal(mi_heap_is_initialized(heap)); - // TODO: copy full empty heap instead? - memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); - _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); - heap->thread_delayed_free = NULL; - heap->page_count = 0; -} +mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t exclusive_arena_id) { + // always allocate heap data in the (subprocess) main heap + mi_heap_t* const heap_main = mi_heap_main(); + // todo: allocate heap data in the exclusive arena ? + mi_heap_t* const heap = (mi_heap_t*)mi_heap_zalloc( heap_main, sizeof(mi_heap_t) ); + if (heap==NULL) return NULL; -// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. 
-static void mi_heap_free(mi_heap_t* heap) { - mi_assert(heap != NULL); - mi_assert_internal(mi_heap_is_initialized(heap)); - if (heap==NULL || !mi_heap_is_initialized(heap)) return; - if (mi_heap_is_backing(heap)) return; // dont free the backing heap - - // reset default - if (mi_heap_is_default(heap)) { - _mi_heap_set_default_direct(heap->tld->heap_backing); + // reserve a thread local slot for this heap (see also issue #1230) + const mi_thread_local_t theap_slot = _mi_thread_local_create(); + if (theap_slot == 0) { + _mi_error_message(EFAULT, "unable to dynamically create a thread local for a heap\n"); + mi_free(heap); + return NULL; } - // remove ourselves from the thread local heaps list - // linear search but we expect the number of heaps to be relatively small - mi_heap_t* prev = NULL; - mi_heap_t* curr = heap->tld->heaps; - while (curr != heap && curr != NULL) { - prev = curr; - curr = curr->next; + // init fields + heap->theap = theap_slot; + heap->subproc = heap_main->subproc; + heap->heap_seq = mi_atomic_increment_relaxed(&heap_main->subproc->heap_total_count); + heap->exclusive_arena = _mi_arena_from_id(exclusive_arena_id); + heap->numa_node = -1; // no initial affinity + + mi_lock_init(&heap->theaps_lock); + mi_lock_init(&heap->os_abandoned_pages_lock); + mi_lock_init(&heap->arena_pages_lock); + + // push onto the subproc heaps + mi_lock(&heap->subproc->heaps_lock) { + mi_heap_t* head = heap->subproc->heaps; + heap->prev = NULL; + heap->next = head; + if (head!=NULL) { head->prev = heap; } + heap->subproc->heaps = heap; } - mi_assert_internal(curr == heap); - if (curr == heap) { - if (prev != NULL) { prev->next = heap->next; } - else { heap->tld->heaps = heap->next; } - } - mi_assert_internal(heap->tld->heaps != NULL); - - // and free the used memory - mi_free(heap); + mi_atomic_increment_relaxed(&heap_main->subproc->heap_count); + mi_subproc_stat_increase(heap_main->subproc, heaps, 1); + return heap; } -// return a heap on the same thread as `heap` 
specialized for the specified tag (if it exists) -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag) { - if (heap->tag == tag) { - return heap; - } - for (mi_heap_t *curr = heap->tld->heaps; curr != NULL; curr = curr->next) { - if (curr->tag == tag) { - return curr; +mi_heap_t* mi_heap_new(void) { + return mi_heap_new_in_arena(0); +} + +// free all theaps belonging to this heap (without deleting their pages as we do this arena wise for efficiency) +static void mi_heap_free_theaps(mi_heap_t* heap) { + // This can run concurrently with a thread that terminates (see `init.c:mi_thread_theaps_done`), + // and we need to ensure we free theaps atomically. + // We do this in a loop where we release the theaps_lock at every potential re-iteration to unblock + // potential concurrent thread termination which tries to remove the theap from our theaps list. + bool all_freed; + do { + all_freed = true; + mi_theap_t* theap = NULL; + mi_lock(&heap->theaps_lock) { + theap = heap->theaps; + while(theap != NULL) { + mi_theap_t* next = theap->hnext; + if (!_mi_theap_free(theap, false /* dont re-acquire the heap->theaps_lock */, true /* acquire the tld->theaps_lock though */ )) { + all_freed = false; + } + theap = next; + } } + if (!all_freed) { mi_heap_stat_counter_increase(heap,heaps_delete_wait,1); mi_atomic_yield(); } + else { mi_assert_internal(heap->theaps==NULL); } } - return NULL; + while(!all_freed); } -/* ----------------------------------------------------------- - Heap destroy ------------------------------------------------------------ */ - -static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { - MI_UNUSED(arg1); - MI_UNUSED(arg2); - MI_UNUSED(heap); - MI_UNUSED(pq); - - // ensure no more thread_delayed_free will be added - _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); - - // stats - const size_t bsize = mi_page_block_size(page); - if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) { - if (bsize <= 
MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, large, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); +// free the heap resources (assuming the pages are already moved/destroyed, and all theaps have been freed) +static void mi_heap_free(mi_heap_t* heap) { + mi_assert_internal(heap!=NULL && !_mi_is_heap_main(heap)); + + // free all arena pages infos + mi_lock(&heap->arena_pages_lock) { + for (size_t i = 0; i < MI_MAX_ARENAS; i++) { + mi_arena_pages_t* arena_pages = mi_atomic_load_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[i]); + if (arena_pages!=NULL) { + mi_atomic_store_ptr_relaxed(mi_arena_pages_t, &heap->arena_pages[i], NULL); + mi_free(arena_pages); + } } } -#if (MI_STAT) - _mi_page_free_collect(page, false); // update used count - const size_t inuse = page->used; - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize * inuse); -#if (MI_STAT>1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); -#endif - } - mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks... -#endif - - /// pretend it is all free now - mi_assert_internal(mi_page_thread_free(page) == NULL); - page->used = 0; - - // and free the page - // mi_page_free(page,false); - page->next = NULL; - page->prev = NULL; - _mi_segment_page_free(page,false /* no force? 
*/, &heap->tld->segments); - - return true; // keep going -} -void _mi_heap_destroy_pages(mi_heap_t* heap) { - mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL); - mi_heap_reset_pages(heap); -} - -#if MI_TRACK_HEAP_DESTROY -static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { - MI_UNUSED(heap); MI_UNUSED(area); MI_UNUSED(arg); MI_UNUSED(block_size); - mi_track_free_size(block,mi_usable_size(block)); - return true; -} -#endif - -void mi_heap_destroy(mi_heap_t* heap) { - mi_assert(heap != NULL); - mi_assert(mi_heap_is_initialized(heap)); - mi_assert(heap->no_reclaim); - mi_assert_expensive(mi_heap_is_valid(heap)); - if (heap==NULL || !mi_heap_is_initialized(heap)) return; - if (!heap->no_reclaim) { - // don't free in case it may contain reclaimed pages - mi_heap_delete(heap); - } - else { - // track all blocks as freed - #if MI_TRACK_HEAP_DESTROY - mi_heap_visit_blocks(heap, true, mi_heap_track_block_free, NULL); - #endif - // free all pages - _mi_heap_destroy_pages(heap); - mi_heap_free(heap); + // remove the heap from the subproc + mi_heap_stats_merge_to_main(heap); + mi_atomic_decrement_relaxed(&heap->subproc->heap_count); + mi_subproc_stat_decrease(heap->subproc, heaps, 1); + mi_lock(&heap->subproc->heaps_lock) { + if (heap->next!=NULL) { heap->next->prev = heap->prev; } + if (heap->prev!=NULL) { heap->prev->next = heap->next; } + else { heap->subproc->heaps = heap->next; } } -} -// forcefully destroy all heaps in the current thread -void _mi_heap_unsafe_destroy_all(void) { - mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* curr = bheap->tld->heaps; - while (curr != NULL) { - mi_heap_t* next = curr->next; - if (curr->no_reclaim) { - mi_heap_destroy(curr); - } - else { - _mi_heap_destroy_pages(curr); - } - curr = next; - } -} - -/* ----------------------------------------------------------- - Safe Heap delete 
------------------------------------------------------------ */ - -// Transfer the pages from one heap to the other -static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { - mi_assert_internal(heap!=NULL); - if (from==NULL || from->page_count == 0) return; - - // reduce the size of the delayed frees - _mi_heap_delayed_free_partial(from); - - // transfer all pages by appending the queues; this will set a new heap field - // so threads may do delayed frees in either heap for a while. - // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state - // so after this only the new heap will get delayed frees - for (size_t i = 0; i <= MI_BIN_FULL; i++) { - mi_page_queue_t* pq = &heap->pages[i]; - mi_page_queue_t* append = &from->pages[i]; - size_t pcount = _mi_page_queue_append(heap, pq, append); - heap->page_count += pcount; - from->page_count -= pcount; - } - mi_assert_internal(from->page_count == 0); - - // and do outstanding delayed frees in the `from` heap - // note: be careful here as the `heap` field in all those pages no longer point to `from`, - // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a - // the regular `_mi_free_delayed_block` which is safe. - _mi_heap_delayed_free_all(from); - #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); - #endif - - // and reset the `from` heap - mi_heap_reset_pages(from); + _mi_thread_local_free(heap->theap); + mi_lock_done(&heap->theaps_lock); + mi_lock_done(&heap->os_abandoned_pages_lock); + mi_free(heap); } -// Safe delete a heap without freeing any still allocated blocks in that heap. 
-void mi_heap_delete(mi_heap_t* heap) -{ - mi_assert(heap != NULL); - mi_assert(mi_heap_is_initialized(heap)); - mi_assert_expensive(mi_heap_is_valid(heap)); - if (heap==NULL || !mi_heap_is_initialized(heap)) return; - - if (!mi_heap_is_backing(heap)) { - // transfer still used pages to the backing heap - mi_heap_absorb(heap->tld->heap_backing, heap); +void mi_heap_delete(mi_heap_t* heap) { + if (heap==NULL) return; + if (_mi_is_heap_main(heap)) { + _mi_warning_message("cannot delete the main heap\n"); + return; } - else { - // the backing heap abandons its pages - _mi_heap_collect_abandon(heap); - } - mi_assert_internal(heap->page_count==0); + mi_heap_free_theaps(heap); + _mi_heap_move_pages(heap, mi_heap_main()); mi_heap_free(heap); } -mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { - mi_assert(heap != NULL); - mi_assert(mi_heap_is_initialized(heap)); - if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL; - mi_assert_expensive(mi_heap_is_valid(heap)); - mi_heap_t* old = mi_prim_get_default_heap(); - _mi_heap_set_default_direct(heap); - return old; +void _mi_heap_force_destroy(mi_heap_t* heap) { + if (heap==NULL) return; + mi_heap_free_theaps(heap); + _mi_heap_destroy_pages(heap); + if (!_mi_is_heap_main(heap)) { mi_heap_free(heap); } } - - - -/* ----------------------------------------------------------- - Analysis ------------------------------------------------------------ */ - -// static since it is not thread safe to access heaps from other threads. 
-static mi_heap_t* mi_heap_of_block(const void* p) { - if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); - bool valid = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(valid); - if mi_unlikely(!valid) return NULL; - return mi_page_heap(_mi_segment_page_of(segment,p)); -} - -bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { - mi_assert(heap != NULL); - if (heap==NULL || !mi_heap_is_initialized(heap)) return false; - return (heap == mi_heap_of_block(p)); -} - - -static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) { - MI_UNUSED(heap); - MI_UNUSED(pq); - bool* found = (bool*)vfound; - void* start = mi_page_start(page); - void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); - *found = (p >= start && p < end); - return (!*found); // continue if not found -} - -bool mi_heap_check_owned(mi_heap_t* heap, const void* p) { - mi_assert(heap != NULL); - if (heap==NULL || !mi_heap_is_initialized(heap)) return false; - if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers - bool found = false; - mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found); - return found; -} - -bool mi_check_owned(const void* p) { - return mi_heap_check_owned(mi_prim_get_default_heap(), p); -} - -/* ----------------------------------------------------------- - Visit all heap blocks and areas - Todo: enable visiting abandoned pages, and - enable visiting all blocks of all heaps across threads ------------------------------------------------------------ */ - -// Separate struct to keep `mi_page_t` out of the public interface -typedef struct mi_heap_area_ex_s { - mi_heap_area_t area; - mi_page_t* page; -} mi_heap_area_ex_t; - -static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) { - mi_assert(xarea != NULL); - if (xarea==NULL) return true; - const mi_heap_area_t* 
area = &xarea->area; - mi_page_t* page = xarea->page; - mi_assert(page != NULL); - if (page == NULL) return true; - - _mi_page_free_collect(page,true); - mi_assert_internal(page->local_free == NULL); - if (page->used == 0) return true; - - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); // without padding - size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); - - if (page->capacity == 1) { - // optimize page with one block - mi_assert_internal(page->used == 1 && page->free == NULL); - return visitor(mi_page_heap(page), area, pstart, ubsize, arg); - } - - // create a bitmap of free blocks. - #define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) - uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; - memset(free_map, 0, sizeof(free_map)); - - #if MI_DEBUG>1 - size_t free_count = 0; - #endif - for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { - #if MI_DEBUG>1 - free_count++; - #endif - mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); - size_t offset = (uint8_t*)block - pstart; - mi_assert_internal(offset % bsize == 0); - size_t blockidx = offset / bsize; // Todo: avoid division? 
- mi_assert_internal( blockidx < MI_MAX_BLOCKS); - size_t bitidx = (blockidx / sizeof(uintptr_t)); - size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); - free_map[bitidx] |= ((uintptr_t)1 << bit); - } - mi_assert_internal(page->capacity == (free_count + page->used)); - - // walk through all blocks skipping the free ones - #if MI_DEBUG>1 - size_t used_count = 0; - #endif - for (size_t i = 0; i < page->capacity; i++) { - size_t bitidx = (i / sizeof(uintptr_t)); - size_t bit = i - (bitidx * sizeof(uintptr_t)); - uintptr_t m = free_map[bitidx]; - if (bit == 0 && m == UINTPTR_MAX) { - i += (sizeof(uintptr_t) - 1); // skip a run of free blocks - } - else if ((m & ((uintptr_t)1 << bit)) == 0) { - #if MI_DEBUG>1 - used_count++; - #endif - uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; - } +void mi_heap_destroy(mi_heap_t* heap) { + if (heap==NULL) return; + if (_mi_is_heap_main(heap)) { + _mi_warning_message("cannot destroy the main heap\n"); + return; } - mi_assert_internal(page->used == used_count); - return true; + _mi_heap_force_destroy(heap); } -typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); - - -static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { - MI_UNUSED(heap); - MI_UNUSED(pq); - mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; - mi_heap_area_ex_t xarea; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); - xarea.page = page; - xarea.area.reserved = page->reserved * bsize; - xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = mi_page_start(page); - xarea.area.used = page->used; // number of blocks in use (#553) - xarea.area.block_size = ubsize; - xarea.area.full_block_size = bsize; - return fun(heap, &xarea, arg); +mi_heap_t* mi_heap_of(const void* p) { + mi_page_t* page = 
_mi_safe_ptr_page(p); + if (page==NULL) return NULL; + return mi_page_heap(page); } -// Visit all heap pages as areas -static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) { - if (visitor == NULL) return false; - return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{ +bool mi_any_heap_contains(const void* p) { + return (mi_heap_of(p)!=NULL); } -// Just to pass arguments -typedef struct mi_visit_blocks_args_s { - bool visit_blocks; - mi_block_visit_fun* visitor; - void* arg; -} mi_visit_blocks_args_t; - -static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) { - mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; - if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; - if (args->visit_blocks) { - return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg); - } - else { - return true; - } +bool mi_heap_contains(const mi_heap_t* heap, const void* p) { + if (heap==NULL) { heap = mi_heap_main(); } + return (heap==mi_heap_of(p)); } -// Visit all blocks in a heap -bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - mi_visit_blocks_args_t args = { visit_blocks, visitor, arg }; - return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args); +// deprecated +bool mi_check_owned(const void* p) { + return mi_any_heap_contains(p); +} + +// unsafe heap utilization function for DragonFly (see issue #1258) +// If the page of pointer `p` belongs to `heap` (or `heap==NULL`) and has less than `perc_threshold` used blocks in its used area return `true`. +// This function is unsafe in general as it assumes we are the only thread accessing the page of `p`. 
+bool mi_unsafe_heap_page_is_under_utilized(mi_heap_t* heap, void* p, size_t perc_threshold) mi_attr_noexcept { + if (p==NULL) return false; + const mi_page_t* const page = _mi_safe_ptr_page(p); // Get the page containing this pointer + if (page==NULL || page->used==page->capacity || page->capacity < page->reserved) return false; + // If the page is the head of the queue, it is currently being used for + // allocations; we skip it to avoid immediate thrashing. + if (page->prev == NULL) return false; + + // match heap? + const mi_heap_t* const page_heap = mi_page_heap(page); + if (page_heap==NULL) return false; + if (heap!=NULL && page_heap!=heap) return false; + + // check utilization + if (page->capacity==0) return false; + if (perc_threshold>=100) return true; + return (perc_threshold >= ((100*page->used) / page->capacity)); } diff --git a/system/lib/mimalloc/src/init.c b/system/lib/mimalloc/src/init.c index 6f51ca8923c33..093f642ca95ce 100644 --- a/system/lib/mimalloc/src/init.c +++ b/system/lib/mimalloc/src/init.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -11,35 +11,34 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit +#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC) // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, - false, false, false, false, - 0, // capacity - 0, // reserved capacity - { 0 }, // flags - false, // is_zero - 0, // retire_expire - NULL, // free - NULL, // local_free - 0, // used - 0, // block size shift - 0, // heap tag - 0, // block_size - NULL, // page_start + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used + 0, // capacity + 0, // reserved capacity + 0, // retire_expire + false, // is_zero + NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + 0, // block_size + NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) - { 0, 0 }, + { 0, 0 }, // keys #endif - MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xheap - NULL, NULL - , { 0 } // padding + NULL, // theap + NULL, // heap + NULL, NULL, // next, prev + MI_ARENA_SLICE_SIZE, // page_committed + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) -#if (MI_SMALL_WSIZE_MAX==128) #if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #elif (MI_PADDING>0) @@ -47,12 +46,10 @@ const mi_page_t _mi_page_empty = { #else #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } #endif -#else -#error "define right initialization sizes corresponding to MI_SMALL_WSIZE_MAX" -#endif + // Empty page queues for every bin -#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } +#define QNULL(sz) { NULL, NULL, 0, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ @@ -64,313 +61,602 
@@ const mi_page_t _mi_page_empty = { QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ - QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ - QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ } + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \ + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ } -#define MI_STAT_COUNT_NULL() {0,0,0,0} +#define MI_STAT_COUNT_NULL() {0,0,0} // Empty statistics -#if MI_STAT>1 -#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) } -#else -#define MI_STAT_COUNT_END_NULL() -#endif - #define MI_STATS_NULL \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ - MI_STAT_COUNT_END_NULL() - - -// Empty slice span queues for every bin -#define SQNULL(sz) { NULL, NULL, sz } -#define MI_SEGMENT_SPAN_QUEUES_EMPTY \ - { SQNULL(1), \ - SQNULL( 1), SQNULL( 2), SQNULL( 3), SQNULL( 4), SQNULL( 5), SQNULL( 6), SQNULL( 7), SQNULL( 10), /* 8 */ \ - SQNULL( 12), SQNULL( 14), SQNULL( 16), SQNULL( 20), SQNULL( 24), SQNULL( 28), SQNULL( 32), SQNULL( 40), /* 16 */ \ - SQNULL( 48), SQNULL( 56), SQNULL( 64), SQNULL( 80), SQNULL( 96), SQNULL( 112), SQNULL( 128), SQNULL( 160), /* 24 */ \ - SQNULL( 192), SQNULL( 
224), SQNULL( 256), SQNULL( 320), SQNULL( 384), SQNULL( 448), SQNULL( 512), SQNULL( 640), /* 32 */ \ - SQNULL( 768), SQNULL( 896), SQNULL( 1024) /* 35 */ } - + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + { 0 }, { 0 }, \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + { 0 }, { 0 }, { 0 }, { 0 }, \ + { 0 }, { 0 }, { 0 }, { 0 }, \ + \ + { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, \ + MI_INIT6(MI_STAT_COUNT_NULL), \ + { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, \ + \ + { MI_INIT4(MI_STAT_COUNT_NULL) }, \ + { { 0 }, { 0 }, { 0 }, { 0 } }, \ + \ + { MI_INIT74(MI_STAT_COUNT_NULL) }, \ + { MI_INIT74(MI_STAT_COUNT_NULL) }, \ + { MI_INIT5(MI_STAT_COUNT_NULL) } // -------------------------------------------------------- -// Statically allocate an empty heap as the initial -// thread local value for the default heap, -// and statically allocate the backing heap for the main +// Statically allocate an empty theap as the initial +// thread local value for the default theap, +// and statically allocate the backing theap for the main // thread so it can function without doing any allocation // itself (as accessing a thread local for the first time // may lead to allocation itself on some platforms) // -------------------------------------------------------- -mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - NULL, - MI_ATOMIC_VAR_INIT(NULL), - 0, // tid - 0, // cookie - 0, // arena id - { 0, 0 }, // keys - { {0}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next - false, // can reclaim - 0, // tag +static mi_decl_cache_align mi_subproc_t subproc_main +#if __cplusplus += { }; // empty initializer to prevent running the constructor (with msvc) +#else += { 0 }; // C zero initialize +#endif + +static mi_subproc_t* subprocs = &subproc_main; +static mi_lock_t subprocs_lock; + +static mi_decl_cache_align mi_tld_t tld_empty 
= { + 0, // thread_id + 0, // thread_seq + 0, // default numa node + &subproc_main, // subproc + NULL, // theaps list + {0}, // theaps lock + false, // recurse + false, // is_in_threadpool + MI_MEMID_STATIC // memid +}; + +mi_decl_cache_align const mi_theap_t _mi_theap_empty = { + &tld_empty, // tld + MI_ATOMIC_VAR_INIT(NULL), // heap + MI_ATOMIC_VAR_INIT(1), // refcount + 0, // heartbeat + 0, // cookie + { {0}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + 0, // pages_full_size + 0, 0, // generic count + NULL, NULL, // tnext, tprev + NULL, NULL, // hnext, hprev + 0, // full page retain + false, // allow reclaim + true, // allow abandon + #if MI_GUARDED + 0, 0, 0, 1, // sample count is 1 so we never write to it (see `internal.h:mi_theap_malloc_use_guarded`) + #endif MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC, + { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }, // stats }; -#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) -#define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os))) +mi_decl_cache_align const mi_theap_t _mi_theap_empty_wrong = { + &tld_empty, // tld + MI_ATOMIC_VAR_INIT(NULL), // heap + MI_ATOMIC_VAR_INIT(1), // refcount + 0, // heartbeat + 0, // cookie + { {0}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + 0, // pages_full_size + 0, 0, // generic count + NULL, NULL, // tnext, tprev + NULL, NULL, // hnext, hprev + 0, // full page retain + false, // allow reclaim + true, // allow abandon + #if MI_GUARDED + 0, 0, 0, 1, // sample count is 1 so we never write to it (see `internal.h:mi_theap_malloc_use_guarded`) + #endif + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC, + { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }, // stats +}; -mi_decl_cache_align static const mi_tld_t tld_empty = { - 0, - false, - NULL, NULL, - { 
MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments - { 0, tld_empty_stats }, // os - { MI_STATS_NULL } // stats +// Heap for the main thread + +extern mi_decl_hidden mi_decl_cache_align mi_theap_t theap_main; +extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main; + +static mi_decl_cache_align mi_tld_t tld_main = { + 0, // thread_id + 0, // thread_seq + 0, // numa node + &subproc_main, // subproc + &theap_main, // theaps list + {0}, // theaps lock + false, // recurse + false, // is_in_threadpool + MI_MEMID_STATIC // memid }; +mi_decl_cache_align mi_theap_t theap_main = { + &tld_main, // thread local data + MI_ATOMIC_VAR_INIT(&heap_main), // main heap + MI_ATOMIC_VAR_INIT(1), // refcount + 0, // heartbeat + 0, // initial cookie + { {0x846ca68b}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + 0, // pages_full_size + 0, 0, // generic count + NULL, NULL, // tnext, tprev + NULL, NULL, // hnext, hprev + 2, // full page retain + true, // allow page reclaim + true, // allow page abandon + #if MI_GUARDED + 0, 0, 0, 0, + #endif + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC, + { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }, // stats +}; + +mi_decl_cache_align mi_heap_t heap_main +#if __cplusplus + = { }; // empty initializer to prevent running the constructor (with msvc) +#else + = { 0 }; // C zero initialize +#endif + +// the theap belonging to the main heap +mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_main = NULL; + mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } -// the thread-local default heap for allocation -mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; +#if MI_TLS_MODEL_THREAD_LOCAL +// the thread-local main theap for allocation +mi_decl_hidden mi_decl_thread mi_theap_t* __mi_theap_default = (mi_theap_t*)&_mi_theap_empty; +// the last used non-main theap +mi_decl_hidden 
mi_decl_thread mi_theap_t* __mi_theap_cached = (mi_theap_t*)&_mi_theap_empty; +#endif -extern mi_heap_t _mi_heap_main; +bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. -static mi_tld_t tld_main = { - 0, false, - &_mi_heap_main, & _mi_heap_main, - { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments - { 0, &tld_main.stats }, // os - { MI_STATS_NULL } // stats -}; +mi_stats_t _mi_stats_main = { sizeof(mi_stats_t), MI_STAT_VERSION, MI_STATS_NULL }; -mi_heap_t _mi_heap_main = { - &tld_main, - MI_ATOMIC_VAR_INIT(NULL), - 0, // thread id - 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next heap - false, // can reclaim - 0, // tag - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY -}; +#if MI_GUARDED +mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed) { + theap->guarded_sample_rate = sample_rate; + theap->guarded_sample_count = sample_rate; // count down samples + if (theap->guarded_sample_rate > 1) { + if (seed == 0) { + seed = _mi_theap_random_next(theap); + } + theap->guarded_sample_count = (seed % theap->guarded_sample_rate) + 1; // start at random count between 1 and `sample_rate` + } +} -bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. +mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max) { + theap->guarded_size_min = min; + theap->guarded_size_max = (min > max ? 
min : max); +} -mi_stats_t _mi_stats_main = { MI_STATS_NULL }; +void _mi_theap_guarded_init(mi_theap_t* theap) { + mi_theap_guarded_set_sample_rate(theap, + (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), + (size_t)mi_option_get(mi_option_guarded_sample_seed)); + mi_theap_guarded_set_size_bound(theap, + (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); +} +#else +mi_decl_export void mi_theap_guarded_set_sample_rate(mi_theap_t* theap, size_t sample_rate, size_t seed) { + MI_UNUSED(theap); MI_UNUSED(sample_rate); MI_UNUSED(seed); +} +mi_decl_export void mi_theap_guarded_set_size_bound(mi_theap_t* theap, size_t min, size_t max) { + MI_UNUSED(theap); MI_UNUSED(min); MI_UNUSED(max); +} +void _mi_theap_guarded_init(mi_theap_t* theap) { + MI_UNUSED(theap); +} +#endif -static void mi_heap_main_init(void) { - if (_mi_heap_main.cookie == 0) { - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = 1; - #if defined(_WIN32) && !defined(MI_SHARED_LIB) - _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking +/* ----------------------------------------------------------- + Initialization + Note: on some platforms lock_init or just a thread local access + can cause allocation and induce recursion during initialization. 
+----------------------------------------------------------- */ + + +// Initialize main subproc +static void mi_subproc_main_init(void) { + if (subproc_main.memid.memkind != MI_MEM_STATIC) { + subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); + subproc_main.heaps = &heap_main; + subproc_main.heap_total_count = 1; + subproc_main.heap_count = 1; + mi_atomic_store_ptr_release(mi_heap_t, &subproc_main.heap_main, &heap_main); + __mi_stat_increase_mt(&subproc_main.stats.heaps, 1); + mi_lock_init(&subproc_main.arena_reserve_lock); + mi_lock_init(&subproc_main.heaps_lock); + mi_lock_init(&subprocs_lock); + mi_lock_init(&tld_empty.theaps_lock); + } +} + +// Initialize main tld +static void mi_tld_main_init(void) { + if (tld_main.thread_id == 0) { + tld_main.thread_id = _mi_prim_thread_id(); + mi_lock_init(&tld_main.theaps_lock); + } +} + +void _mi_theap_options_init(mi_theap_t* theap) { + theap->allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0); + theap->allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); + theap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); +} + +// Initialization of the (statically allocated) main theap, and the main tld and subproc. 
+static void mi_theap_main_init(void) { + if mi_unlikely(theap_main.memid.memkind != MI_MEM_STATIC) { + // theap + theap_main.memid = _mi_memid_create(MI_MEM_STATIC); + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&theap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking (issue #1185) #else - _mi_random_init(&_mi_heap_main.random); + _mi_random_init(&theap_main.random); #endif - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + theap_main.cookie = _mi_theap_random_next(&theap_main); + _mi_theap_options_init(&theap_main); + _mi_theap_guarded_init(&theap_main); } } -mi_heap_t* _mi_heap_main_get(void) { - mi_heap_main_init(); - return &_mi_heap_main; +// Initialize main heap +static void mi_heap_main_init(void) { + if mi_unlikely(heap_main.subproc == NULL) { + heap_main.subproc = &subproc_main; + heap_main.theaps = &theap_main; + + mi_theap_main_init(); + mi_subproc_main_init(); + mi_tld_main_init(); + + mi_lock_init(&heap_main.theaps_lock); + mi_lock_init(&heap_main.os_abandoned_pages_lock); + mi_lock_init(&heap_main.arena_pages_lock); + } } /* ----------------------------------------------------------- - Initialization and freeing of the thread local heaps + Thread local data ----------------------------------------------------------- */ -// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). -typedef struct mi_thread_data_s { - mi_heap_t heap; // must come first due to cast in `_mi_heap_done` - mi_tld_t tld; - mi_memid_t memid; // must come last due to zero'ing -} mi_thread_data_t; - - -// Thread meta-data is allocated directly from the OS. 
For -// some programs that do not use thread pools and allocate and -// destroy many OS threads, this may causes too much overhead -// per thread so we maintain a small cache of recently freed metadata. - -#define TD_CACHE_SIZE (16) -static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; - -static mi_thread_data_t* mi_thread_data_zalloc(void) { - // try to find thread metadata in the cache - bool is_zero = false; - mi_thread_data_t* td = NULL; - for (int i = 0; i < TD_CACHE_SIZE; i++) { - td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - // found cached allocation, try use it - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - break; - } - } +// Allocate fresh tld +static mi_tld_t* mi_tld_alloc(void) { + if (_mi_is_main_thread()) { + mi_atomic_increment_relaxed(&tld_main.subproc->thread_count); + return &tld_main; } - - // if that fails, allocate as meta data - if (td == NULL) { + else { + // allocate tld meta-data + // note: we need to be careful to not access the tld from `_mi_meta_zalloc` + // (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`). mi_memid_t memid; - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // if this fails, try once more. 
(issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - } - } - if (td != NULL) { - td->memid = memid; - is_zero = memid.initially_zero; + mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid); + if (tld==NULL) { + _mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n"); + return NULL; } + tld->memid = memid; + tld->theaps = NULL; + mi_lock_init(&tld->theaps_lock); + tld->subproc = &subproc_main; + tld->numa_node = _mi_os_numa_node(); + tld->thread_id = _mi_prim_thread_id(); + tld->thread_seq = mi_atomic_increment_relaxed(&tld->subproc->thread_total_count); + tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); + mi_atomic_increment_relaxed(&tld->subproc->thread_count); + return tld; } +} - if (td != NULL && !is_zero) { - _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); +#define MI_TLD_INVALID ((mi_tld_t*)1) + +mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) { + mi_lock_done(&tld->theaps_lock); + if (tld != NULL && tld != MI_TLD_INVALID) { + mi_atomic_decrement_relaxed(&tld->subproc->thread_count); + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); } - return td; + #if 0 + // do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage + // (since we are calling this during pthread shutdown) + // (and this could happen on other systems as well, so let's never do it) + thread_tld = MI_TLD_INVALID; + #endif } -static void mi_thread_data_free( mi_thread_data_t* tdfree ) { - // try to add the thread metadata to the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td == NULL) { - mi_thread_data_t* expected = NULL; - if 
(mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { - return; - } +// return the thread local heap ensuring it is initialized (and not `NULL` or `&_mi_theap_empty`); +mi_theap_t* _mi_theap_default_safe(void) { + mi_theap_t* theap = _mi_theap_default(); + if mi_likely(mi_theap_is_initialized(theap)) return theap; + mi_thread_init(); + mi_assert_internal(mi_theap_is_initialized(_mi_theap_default())); + return _mi_theap_default(); +} + +// return the main theap ensuring it is initialized. +mi_theap_t* _mi_theap_main_safe(void) { + mi_theap_t* theap = __mi_theap_main; + if mi_unlikely(theap==NULL) { // if thread_init or default_set was never called + mi_thread_init(); // sets the default slot to the main theap + theap = _mi_theap_default(); + mi_assert_internal(theap!=NULL); + mi_assert_internal(_mi_is_theap_main(theap)); + if (_mi_is_theap_main(theap)) { + __mi_theap_main = theap; } } - // if that fails, just free it directly - _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main); + mi_assert_internal(theap!=NULL && _mi_is_theap_main(theap)); + return theap; } -void _mi_thread_data_collect(void) { - // free all thread metadata from the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main); - } - } + +mi_subproc_t* _mi_subproc_main(void) { + return &subproc_main; +} + +mi_subproc_t* _mi_subproc(void) { + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + // todo: this will still fail on OS systems where the first access to a thread-local causes allocation. 
+ // on such systems we can check for this with the _mi_prim_get_default_theap as those are protected (by being + // stored in a TLS slot for example) + mi_theap_t* theap = _mi_theap_default(); + if (theap == NULL) { + return _mi_subproc_main(); + } + else { + return theap->tld->subproc; // avoid using thread local storage (`thread_tld`) } } -// Initialize the thread local default heap, called from `mi_thread_init` -static bool _mi_thread_heap_init(void) { - if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; - if (_mi_is_main_thread()) { - // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization - // the main heap is statically allocated - mi_heap_main_init(); - _mi_heap_set_default_direct(&_mi_heap_main); - //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); +mi_heap_t* _mi_subproc_heap_main(mi_subproc_t* subproc) { + mi_heap_t* heap = mi_atomic_load_ptr_relaxed(mi_heap_t,&subproc->heap_main); + if mi_likely(heap!=NULL) { + return heap; } else { - // use `_mi_os_alloc` to allocate directly from the OS - mi_thread_data_t* td = mi_thread_data_zalloc(); - if (td == NULL) return false; + mi_heap_main_init(); + mi_assert_internal(mi_atomic_load_relaxed(&subproc->heap_main) != NULL); + return mi_atomic_load_ptr_relaxed(mi_heap_t,&subproc->heap_main); + } +} + +mi_heap_t* mi_heap_main(void) { + return _mi_subproc_heap_main(_mi_subproc()); // don't use mi_theap_main_init_get() so this call works during process_init +} - mi_tld_t* tld = &td->tld; - mi_heap_t* heap = &td->heap; - _mi_tld_init(tld, heap); // must be before `_mi_heap_init` - _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); - _mi_heap_set_default_direct(heap); +bool _mi_is_heap_main(const mi_heap_t* heap) { + mi_assert_internal(heap!=NULL); + return (_mi_subproc_heap_main(heap->subproc) == heap); +} + +bool _mi_is_theap_main(const mi_theap_t* 
theap) { + return (mi_theap_is_initialized(theap) && _mi_is_heap_main(_mi_theap_heap(theap))); +} + +/* ----------------------------------------------------------- + Sub process +----------------------------------------------------------- */ + +mi_subproc_id_t mi_subproc_main(void) { + return _mi_subproc_main(); +} + +mi_subproc_id_t mi_subproc_current(void) { + return _mi_subproc(); +} + +mi_subproc_id_t mi_subproc_new(void) { + static _Atomic(size_t) subproc_total_count; + mi_memid_t memid; + mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); + if (subproc == NULL) return NULL; + subproc->memid = memid; + subproc->subproc_seq = mi_atomic_increment_relaxed(&subproc_total_count) + 1; + mi_lock_init(&subproc->arena_reserve_lock); + mi_lock_init(&subproc->heaps_lock); + mi_lock(&subprocs_lock) { + // push on subproc list + subproc->next = subprocs; + if (subprocs!=NULL) { subprocs->prev = subproc; } + subprocs = subproc; } - return false; + return subproc; } -// initialize thread local data -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { - _mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t)); - tld->heap_backing = bheap; - tld->heaps = NULL; - tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; - tld->os.stats = &tld->stats; +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { + return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id); } -// Free the thread local default heap (called from `mi_thread_done`) -static bool _mi_thread_heap_done(mi_heap_t* heap) { - if (!mi_heap_is_initialized(heap)) return true; +// destroy all subproc resources including arena's, heap's etc. 
+static void mi_subproc_unsafe_destroy(mi_subproc_t* subproc) +{ + // remove from the subproc list + mi_lock(&subprocs_lock) { + if (subproc->next!=NULL) { subproc->next->prev = subproc->prev; } + if (subproc->prev!=NULL) { subproc->prev->next = subproc->next; } + else { mi_assert_internal(subprocs==subproc); subprocs = subproc->next; } + } - // reset default heap - _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); + // destroy all subproc heaps + mi_lock(&subproc->heaps_lock) { + mi_heap_t* heap = subproc->heaps; + while (heap != NULL) { + mi_heap_t* next = heap->next; + if (heap!=subproc->heap_main) { mi_heap_destroy(heap); } + heap = next; + } + mi_assert_internal(subproc->heaps == subproc->heap_main); + _mi_heap_force_destroy(subproc->heap_main); // no warning if destroying the main heap + } - // switch to backing heap - heap = heap->tld->heap_backing; - if (!mi_heap_is_initialized(heap)) return false; + // remove associated arenas + _mi_arenas_unsafe_destroy_all(subproc); - // delete all non-backing heaps in this thread - mi_heap_t* curr = heap->tld->heaps; - while (curr != NULL) { - mi_heap_t* next = curr->next; // save `next` as `curr` will be freed - if (curr != heap) { - mi_assert_internal(!mi_heap_is_backing(curr)); - mi_heap_delete(curr); + // merge stats back into the main subproc? + if (subproc!=&subproc_main) { + _mi_stats_merge_into(&subproc_main.stats, &subproc->stats); + } + + // safe to release + // todo: should we refcount subprocesses? 
+ mi_lock_done(&subproc->arena_reserve_lock); + mi_lock_done(&subproc->heaps_lock); + if (subproc!=&subproc_main) { + _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); + } + else { + // for the main subproc, also release the global page map + _mi_page_map_unsafe_destroy(&subproc_main); + } +} + +void mi_subproc_destroy(mi_subproc_id_t subproc_id) { + if (subproc_id == NULL) return; + mi_subproc_unsafe_destroy(_mi_subproc_from_id(subproc_id)); +} + +static void mi_subprocs_unsafe_destroy_all(void) { + mi_lock(&subprocs_lock) { + mi_subproc_t* subproc = subprocs; + while (subproc!=NULL) { + mi_subproc_t* next = subproc->next; + if (subproc!=&subproc_main) { + mi_subproc_unsafe_destroy(subproc); + } + subproc = next; } - curr = next; } - mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL); - mi_assert_internal(mi_heap_is_backing(heap)); + mi_subproc_unsafe_destroy(&subproc_main); +} + + +void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + mi_tld_t* const tld = _mi_theap_default_safe()->tld; + mi_assert(tld->subproc== &subproc_main); + if (tld->subproc != &subproc_main) { + _mi_warning_message("unable to add thread to the subprocess as it was already in another subprocess (id: %p)\n", subproc); + return; + } + tld->subproc = subproc; + tld->thread_seq = mi_atomic_increment_relaxed(&subproc->thread_total_count); + mi_atomic_decrement_relaxed(&subproc_main.thread_count); + mi_atomic_increment_relaxed(&subproc->thread_count); +} - // collect if not the main thread - if (heap != &_mi_heap_main) { - _mi_heap_collect_abandon(heap); + +bool mi_subproc_visit_heaps(mi_subproc_id_t subproc_id, mi_heap_visit_fun* visitor, void* arg) { + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + if (subproc==NULL) return false; + bool ok = true; + mi_lock(&subproc->heaps_lock) { + for (mi_heap_t* heap = subproc->heaps; heap!=NULL && ok; heap = heap->next) { + ok = (*visitor)(heap, 
arg); + } } + return ok; +} - // merge stats - _mi_stats_done(&heap->tld->stats); - // free if not the main thread - if (heap != &_mi_heap_main) { - // the following assertion does not always hold for huge segments as those are always treated - // as abondened: one may allocate it in one thread, but deallocate in another in which case - // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363 - // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); - mi_thread_data_free((mi_thread_data_t*)heap); +/* ----------------------------------------------------------- + Allocate theap data +----------------------------------------------------------- */ + +// Initialize the thread local default theap, called from `mi_thread_init` +static mi_theap_t* _mi_thread_init_theap_default(void) { + mi_theap_t* theap = _mi_theap_default(); + if (mi_theap_is_initialized(theap)) return theap; + if (_mi_is_main_thread()) { + mi_heap_main_init(); + theap = &theap_main; } else { - #if 0 - // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, - // there may still be delete/free calls after the mi_fls_done is called. Issue #207 - _mi_heap_destroy_pages(heap); - mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); - #endif + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // (on macOS <= 14 for example where the loader allocates thread-local data on demand). 
+ mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the theap for the main heap + theap = _mi_theap_create(mi_heap_main(), tld); } - return false; + // associate the theap with this thread + // (this is safe, on macOS for example, the theap is set in a dedicated TLS slot and thus does not cause recursive allocation) + _mi_theap_default_set(theap); + return theap; +} + + +// Free the thread local theaps +static void mi_thread_theaps_done(mi_tld_t* tld) +{ + // reset the thread local theaps + _mi_theap_default_set((mi_theap_t*)&_mi_theap_empty); + _mi_theap_cached_set((mi_theap_t*)&_mi_theap_empty); + __mi_theap_main = NULL; + + // abandon the pages of all theaps in this thread + mi_lock(&tld->theaps_lock) { + mi_theap_t* theap = tld->theaps; + while (theap != NULL) { + mi_theap_t* next = theap->tnext; + // never destroy theaps; if a dll is linked statically with mimalloc, + // there may still be delete/free calls after the mi_fls_done is called. Issue #207 + _mi_theap_collect_abandon(theap); + mi_assert_internal(theap->page_count==0); + theap = next; + } + } + + // free the theaps of this thread. + // This can run concurrently with a `mi_heap_free_theaps` and we need to ensure we free theaps atomically. + // We do this in a loop where we release the theaps_lock at every potential re-iteration to unblock + // potential concurrent `mi_heap_free_theaps` which tries to remove the theap from our theaps list. 
+ bool all_freed; + do { + all_freed = true; + mi_lock(&tld->theaps_lock) { + mi_theap_t* theap = tld->theaps; + while (theap != NULL) { + mi_theap_t* next = theap->tnext; + mi_assert_internal(theap->page_count==0); + if (!_mi_theap_free(theap, true /* acquire heap->theaps_lock */, false /* dont re-acquire the tld->theaps_lock*/ )) { + all_freed = false; + } + theap = next; + } + } + if (!all_freed) { mi_subproc_stat_counter_increase(tld->subproc,heaps_delete_wait,1); mi_atomic_yield(); } + else { mi_assert_internal(tld->theaps==NULL); } + } while (!all_freed); + + mi_assert(_mi_theap_default()==(mi_theap_t*)&_mi_theap_empty); // careful to not re-initialize the default theap during theap_delete + mi_assert(!mi_theap_is_initialized(_mi_theap_default())); } @@ -383,7 +669,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // 1. windows dynamic library: // call from DllMain on DLL_THREAD_DETACH // 2. windows static library: -// use `FlsAlloc` to call a destructor when the thread is done +// use special linker section to call a destructor when the thread is done // 3. 
unix, pthreads: // use a pthread key to call a destructor when a pthread is done // @@ -397,213 +683,417 @@ static void mi_process_setup_auto_thread_done(void) { if (tls_initialized) return; tls_initialized = true; _mi_prim_thread_init_auto_done(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_theap_default_set(&theap_main); } bool _mi_is_main_thread(void) { - return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); + return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } -static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); - -size_t _mi_current_thread_count(void) { - return mi_atomic_load_relaxed(&thread_count); -} -// This is called from the `mi_malloc_generic` +// Initialize thread void mi_thread_init(void) mi_attr_noexcept { // ensure our process has started already mi_process_init(); + // if the theap_default is already set we have already initialized + if (_mi_thread_is_initialized()) return; - // initialize the thread local default heap - // (this will call `_mi_heap_set_default_direct` and thus set the - // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) - if (_mi_thread_heap_init()) return; // returns true if already initialized + // initialize the default theap + _mi_thread_init_theap_default(); - _mi_stat_increase(&_mi_stats_main.threads, 1); - mi_atomic_increment_relaxed(&thread_count); - //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); + mi_heap_stat_increase(mi_heap_main(), threads, 1); + // _mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } void mi_thread_done(void) mi_attr_noexcept { _mi_thread_done(NULL); } -void _mi_thread_done(mi_heap_t* heap) +void _mi_thread_done(mi_theap_t* _theap_main) { - // calling with NULL implies using the default heap - if (heap == NULL) { - heap = mi_prim_get_default_heap(); - if (heap == NULL) return; + // NULL can be passed on some platforms + if (_theap_main==NULL) { + _theap_main = __mi_theap_main; 
// don't call `mi_theap_main_safe` as that re-initializes the thread + if (_theap_main==NULL) { // can happen if `mi_theap_main_safe` is never called; but then the default is main + _theap_main = _mi_theap_default(); + mi_assert_internal(_theap_main==NULL || _mi_is_theap_main(_theap_main)); + } } - // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) - if (!mi_heap_is_initialized(heap)) { + // prevent re-entrancy through theap_done/theap_set_default_direct (issue #699) + if (!mi_theap_is_initialized(_theap_main)) { return; } + // release dynamic thread_local's + _mi_thread_locals_thread_done(); + + // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage) + mi_tld_t* const tld = _theap_main->tld; + // adjust stats - mi_atomic_decrement_relaxed(&thread_count); - _mi_stat_decrease(&_mi_stats_main.threads, 1); - - // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... - if (heap->thread_id != _mi_thread_id()) return; - - // abandon the thread local heap - if (_mi_thread_heap_done(heap)) return; // returns true if already ran -} - -void _mi_heap_set_default_direct(mi_heap_t* heap) { - mi_assert_internal(heap != NULL); - #if defined(MI_TLS_SLOT) - mi_prim_tls_slot_set(MI_TLS_SLOT,heap); - #elif defined(MI_TLS_PTHREAD_SLOT_OFS) - *mi_prim_tls_pthread_heap_slot() = heap; - #elif defined(MI_TLS_PTHREAD) - // we use _mi_heap_default_key - #else - _mi_heap_default = heap; - #endif + mi_heap_stat_decrease(_mi_subproc_heap_main(tld->subproc), threads, 1); // todo: or `_theap_main->heap`? + + // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local theaps... 
+ if (tld->thread_id != _mi_prim_thread_id()) return; + + // delete the thread local theaps + mi_thread_theaps_done(tld); - // ensure the default heap is passed to `_mi_thread_done` - // setting to a non-NULL value also ensures `mi_thread_done` is called. - _mi_prim_thread_associate_default_heap(heap); + // free thread local data + mi_tld_free(tld); } -// -------------------------------------------------------- -// Run functions on process init/done, and thread init/done -// -------------------------------------------------------- -static void mi_cdecl mi_process_done(void); +mi_decl_cold mi_decl_noinline mi_theap_t* _mi_theap_empty_get(void) { + return (mi_theap_t*)&_mi_theap_empty; +} -static bool os_preloading = true; // true until this module is initialized -static bool mi_redirected = false; // true if malloc redirects to mi_malloc +#if MI_TLS_MODEL_DYNAMIC_WIN32 -// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. -bool mi_decl_noinline _mi_preloading(void) { - return os_preloading; +// If we can, we use one of the 64 direct TLS slots (but fall back to expansion slots if needed) +// See for the offsets. +#if MI_SIZE_SIZE==4 +#define MI_TLS_DIRECT_FIRST (0x0E10 / MI_SIZE_SIZE) +#else +#define MI_TLS_DIRECT_FIRST (0x1480 / MI_SIZE_SIZE) +#endif +#define MI_TLS_DIRECT_SLOTS (64) +#define MI_TLS_EXPANSION_SLOTS (1024) + +#if !MI_WIN_DIRECT_TLS +#define MI_TLS_INITIAL_SLOT MI_TLS_EXPANSION_SLOT +#define MI_TLS_INITIAL_EXPANSION_SLOT (MI_TLS_EXPANSION_SLOTS-1) +#else +// with only direct entries, use the "arbitrary user data" field +// and assume it is NULL (see also ) +#define MI_TLS_INITIAL_SLOT (5) +#define MI_TLS_INITIAL_EXPANSION_SLOT (0) +#endif + +// we initially use the last of the expansion slots as the default NULL. 
+// note: this will fail if the program allocates exactly 1024+64 slots with TlsAlloc (which is quite unlikely) +mi_decl_hidden mi_decl_cache_align size_t _mi_theap_default_slot = MI_TLS_INITIAL_SLOT; +mi_decl_hidden size_t _mi_theap_default_expansion_slot = MI_TLS_INITIAL_EXPANSION_SLOT; +mi_decl_hidden size_t _mi_theap_cached_slot = MI_TLS_INITIAL_SLOT; +mi_decl_hidden size_t _mi_theap_cached_expansion_slot = MI_TLS_INITIAL_EXPANSION_SLOT; + +static DWORD mi_tls_raw_index_default = TLS_OUT_OF_INDEXES; +static DWORD mi_tls_raw_index_cached = TLS_OUT_OF_INDEXES; + +static bool mi_win_tls_slot_alloc(size_t* slot, size_t* extended, DWORD* raw_index) { + const DWORD index = TlsAlloc(); + *raw_index = index; + if (index==TLS_OUT_OF_INDEXES) { + *extended = 0; + *slot = 0; + return false; + } + else if (index= MI_TLS_DIRECT_FIRST && slot < MI_TLS_DIRECT_FIRST + MI_TLS_DIRECT_SLOTS) || slot == MI_TLS_EXPANSION_SLOT); + if (slot < MI_TLS_DIRECT_FIRST + MI_TLS_DIRECT_SLOTS) { + mi_prim_tls_slot_set(slot, value); } - else if (reason == DLL_THREAD_DETACH) { - mi_thread_done(); + else { + mi_assert_internal(extended_slot < MI_TLS_EXPANSION_SLOTS); + TlsSetValue((DWORD)(extended_slot + MI_TLS_DIRECT_SLOTS), value); // use TlsSetValue to initialize the TlsExpansion array if needed } } -__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); -__declspec(dllimport) void mi_cdecl mi_allocator_done(void); -#ifdef __cplusplus + +#elif MI_TLS_MODEL_DYNAMIC_PTHREADS + +// only for pthreads for now +mi_decl_hidden pthread_key_t _mi_theap_default_key = 0; +mi_decl_hidden pthread_key_t _mi_theap_cached_key = 0; + +static void mi_tls_slots_init(void) { + static mi_atomic_once_t tls_keys_init; + if (mi_atomic_once(&tls_keys_init)) { + int err = pthread_key_create(&_mi_theap_default_key, NULL); + if (err==0) { + err = pthread_key_create(&_mi_theap_cached_key, NULL); + } + if (err!=0) { + _mi_error_message(EFAULT, "unable to allocate pthread keys (error %d)\n", err); + } 
+ } } -#endif + +static void mi_tls_slots_done(void) { + if (_mi_theap_default_key != 0) { + pthread_key_delete(_mi_theap_default_key); + _mi_theap_default_key = 0; + } + if (_mi_theap_cached_key != 0) { + pthread_key_delete(_mi_theap_cached_key); + _mi_theap_cached_key = 0; + } +} + #else -static bool mi_allocator_init(const char** message) { - if (message != NULL) *message = NULL; - return true; + +static void mi_tls_slots_init(void) { + // nothing } -static void mi_allocator_done(void) { - // nothing to do + +static void mi_tls_slots_done(void) { + // nothing } + #endif -// Called once by the process loader -static void mi_process_load(void) { - mi_heap_main_init(); - #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) - volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true; - if (dummy == NULL) return; // use dummy or otherwise the access may get optimized away (issue #697) +void _mi_theap_cached_set(mi_theap_t* theap) { + mi_theap_t* prev = _mi_theap_cached(); + if (prev==theap) return; + // set + mi_tls_slots_init(); + #if MI_TLS_MODEL_THREAD_LOCAL + __mi_theap_cached = theap; + #elif MI_TLS_MODEL_FIXED_SLOT + mi_prim_tls_slot_set(MI_TLS_MODEL_FIXED_SLOT_CACHED, theap); + #elif MI_TLS_MODEL_DYNAMIC_WIN32 + mi_win_tls_slot_set(_mi_theap_cached_slot, _mi_theap_cached_expansion_slot, theap); + #elif MI_TLS_MODEL_DYNAMIC_PTHREADS + if (_mi_theap_cached_key!=0) pthread_setspecific(_mi_theap_cached_key, theap); #endif + // update refcounts (so cached theap memory keeps available until no longer cached) + _mi_theap_incref(theap); + _mi_theap_decref(prev); +} + +void _mi_theap_default_set(mi_theap_t* theap) { + mi_theap_t* const theap_old = _mi_theap_default(); + mi_assert_internal(theap != NULL); + mi_assert_internal(theap->tld->thread_id==0 || theap->tld->thread_id==_mi_thread_id()); + mi_tls_slots_init(); + #if MI_TLS_MODEL_THREAD_LOCAL + __mi_theap_default = theap; + #elif MI_TLS_MODEL_FIXED_SLOT + 
mi_prim_tls_slot_set(MI_TLS_MODEL_FIXED_SLOT_DEFAULT, theap); + #elif MI_TLS_MODEL_DYNAMIC_WIN32 + mi_win_tls_slot_set(_mi_theap_default_slot, _mi_theap_default_expansion_slot, theap); + #elif MI_TLS_MODEL_DYNAMIC_PTHREADS + if (_mi_theap_default_key!=0) pthread_setspecific(_mi_theap_default_key, theap); + #endif + + // set theap main if needed + if (mi_theap_is_initialized(theap)) { + // ensure the default theap is passed to `_mi_thread_done` as on some platforms we cannot access TLS at thread termination (as it would allocate again) + _mi_prim_thread_associate_default_theap(theap); + if (_mi_is_heap_main(_mi_theap_heap(theap))) { + __mi_theap_main = theap; + } + } + + // ensure either the default slot contains the main theap, or __mi_theap_main is initialized + if (mi_theap_is_initialized(theap_old) && _mi_is_heap_main(_mi_theap_heap(theap_old))) { + __mi_theap_main = theap_old; + } +} + +void mi_thread_set_in_threadpool(void) mi_attr_noexcept { + mi_theap_t* theap = _mi_theap_default_safe(); + theap->tld->is_in_threadpool = true; +} + +// -------------------------------------------------------- +// Run functions on process init/done, and thread init/done +// -------------------------------------------------------- +static bool os_preloading = true; // true until this module is initialized + +// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. 
+bool mi_decl_noinline _mi_preloading(void) { + return os_preloading; +} + +// Returns true if mimalloc was redirected +mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept { + return _mi_is_redirected(); +} + +// Called once by the process loader from `src/prim/prim.c` +void _mi_auto_process_init(void) { + // mi_heap_main_init(); + // #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) + // volatile mi_theap_t* dummy = __mi_theap_default; // access TLS to allocate it before setting tls_initialized to true; + // if (dummy == NULL) return; // use dummy or otherwise the access may get optimized away (issue #697) + // #endif + os_preloading = false; mi_assert_internal(_mi_is_main_thread()); - #if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521) - atexit(&mi_process_done); - #endif - _mi_options_init(); - mi_process_setup_auto_thread_done(); + mi_process_init(); - if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); + mi_process_setup_auto_thread_done(); + _mi_thread_locals_init(); + _mi_options_post_init(); // now we can print to stderr + if (_mi_is_redirected()) _mi_verbose_message("malloc is redirected.\n"); // show message from the redirector (if present) const char* msg = NULL; - mi_allocator_init(&msg); + _mi_allocator_init(&msg); if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { _mi_fputs(NULL,NULL,NULL,msg); } // reseed random - _mi_random_reinit_if_weak(&_mi_heap_main.random); + _mi_random_reinit_if_weak(&theap_main.random); } -#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) -#include -mi_decl_cache_align bool _mi_cpu_has_fsrm = false; +// CPU features +mi_decl_cache_align size_t _mi_cpu_movsb_max = 0; // for size <= max, rep movsb is fast +mi_decl_cache_align size_t _mi_cpu_stosb_max = 0; // for size <= max, rep stosb is fast +mi_decl_cache_align bool _mi_cpu_has_popcnt = false; + +#if 
(MI_ARCH_X64 || MI_ARCH_X86) +#if defined(__GNUC__) +// #include +static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) { + // note: use explicit assembly instead of __get_cpuid as we need the sublevel (in ecx) + // (on Ubuntu 22 with WSL the __get_cpuid does not clear ecx for level 7 which is incorrect). + uint32_t eax, ebx, ecx, edx; + __asm __volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(level), "c"(sublevel) : ); + regs4[0] = eax; + regs4[1] = ebx; + regs4[2] = ecx; + regs4[3] = edx; + return true; +} + +#elif defined(_MSC_VER) +static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) { + __cpuidex((int32_t*)regs4, (int32_t)level, (int32_t)sublevel); + return true; +} +#else +static bool mi_cpuid(uint32_t* regs4, uint32_t level, uint32_t sublevel) { + MI_UNUSED(regs4); MI_UNUSED(level); MI_UNUSED(sublevel); + return false; +} +#endif static void mi_detect_cpu_features(void) { - // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) - int32_t cpu_info[4]; - __cpuid(cpu_info, 7); - _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + // FSRM for fast short rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) + // EMRS for fast enhanced rep movsb/stosb support (not used at the moment, memcpy always seems faster?) 
+ // FSRS for fast short rep stosb + bool amd = false; + bool fsrm = false; + // bool erms = false; + bool fsrs = false; + uint32_t cpu_info[4]; + if (mi_cpuid(cpu_info, 0, 0)) { + amd = (cpu_info[2]==0x444d4163); // (Auth enti cAMD) + } + if (mi_cpuid(cpu_info, 7, 0)) { + fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + // erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see + } + if (mi_cpuid(cpu_info, 7, 1)) { + fsrs = ((cpu_info[1] & (1 << 11)) != 0); // bit 11 of EBX: see + } + if (mi_cpuid(cpu_info, 1, 0)) { + _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see + } + + if (fsrm) { + _mi_cpu_movsb_max = 127; + } + if (fsrs || (amd && fsrm)) { // fsrm on amd implies fsrs, see: https://marc.info/?l=git-commits-head&m=168186277717803 + _mi_cpu_stosb_max = 127; + } } + #else static void mi_detect_cpu_features(void) { - // nothing + #if MI_ARCH_ARM64 + _mi_cpu_has_popcnt = true; + #endif } #endif + // Initialize the process; called by thread_init or the process loader void mi_process_init(void) mi_attr_noexcept { // ensure we are called once static mi_atomic_once_t process_init; - #if _MSC_VER < 1920 - mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main - #endif + // #if _MSC_VER < 1920 + // mi_heap_main_init(); // vs2017 can dynamically re-initialize theap_main + // #endif if (!mi_atomic_once(&process_init)) return; - _mi_process_is_initialized = true; _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); - mi_process_setup_auto_thread_done(); mi_detect_cpu_features(); + _mi_options_init(); + _mi_stats_init(); _mi_os_init(); - mi_heap_main_init(); - #if MI_DEBUG - _mi_verbose_message("debug level : %d\n", MI_DEBUG); - #endif - _mi_verbose_message("secure level: %d\n", MI_SECURE); - _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL); - #if MI_TSAN - _mi_verbose_message("thread santizer enabled\n"); - #endif + // the following can potentially allocate (on freeBSD for pthread 
keys) + // todo: do 2-phase so we can use stats at first, then later init the keys? + mi_heap_main_init(); // before page_map_init so stats are working + _mi_page_map_init(); // todo: this could fail.. should we abort in that case? mi_thread_init(); + _mi_process_is_initialized = true; - #if defined(_WIN32) + #if defined(_WIN32) && defined(MI_WIN_USE_FLS) // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. - _mi_prim_thread_associate_default_heap(NULL); + _mi_prim_thread_associate_default_theap(NULL); #endif - mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + // mi_stats_reset(); // only call stat reset *after* thread init (or the theap tld == NULL) mi_track_init(); - if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024); - long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at); + int reserve_at = (int)mi_option_get_clamp(mi_option_reserve_huge_os_pages_at, -1, INT_MAX); if (reserve_at != -1) { mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500); } else { @@ -613,13 +1103,13 @@ void mi_process_init(void) mi_attr_noexcept { if (mi_option_is_enabled(mi_option_reserve_os_memory)) { long ksize = mi_option_get(mi_option_reserve_os_memory); if (ksize > 0) { - mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? 
*/); + mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true); } } } -// Called when the process is done (through `at_exit`) -static void mi_cdecl mi_process_done(void) { +// Called when the process is done (cdecl as it is used with `at_exit` on some platforms) +void mi_cdecl mi_process_done(void) mi_attr_noexcept { // only shutdown if we were initialized if (!_mi_process_is_initialized) return; // ensure we are called once @@ -627,6 +1117,9 @@ static void mi_cdecl mi_process_done(void) { if (process_done) return; process_done = true; + // free dynamic thread locals (if used at all) + _mi_thread_locals_done(); + // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); @@ -635,7 +1128,7 @@ static void mi_cdecl mi_process_done(void) { // free all memory if possible on process exit. This is not needed for a stand-alone process // but should be done if mimalloc is statically linked into another shared library which // is repeatedly loaded/unloaded, see issue #281. - mi_collect(true /* force */ ); + mi_theap_collect(_mi_theap_default(), true /* force */); #endif #endif @@ -643,72 +1136,24 @@ static void mi_cdecl mi_process_done(void) { // since after process_done there might still be other code running that calls `free` (like at_exit routines, // or C-runtime termination code. if (mi_option_is_enabled(mi_option_destroy_on_exit)) { - mi_collect(true /* force */); - _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats); + mi_subprocs_unsafe_destroy_all(); // destroys all subprocs, arenas, and the page_map! 
} - + else { + mi_heap_stats_merge_to_subproc(mi_heap_main()); + } + + // careful now to no longer access any allocator functionality if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { - mi_stats_print(NULL); + mi_subproc_stats_print_out(NULL, NULL, NULL); } - mi_allocator_done(); - _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); + mi_lock_done(&subprocs_lock); + mi_tls_slots_done(); + _mi_allocator_done(); + _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id); os_preloading = true; // don't call the C runtime anymore } - - -#if defined(_WIN32) && defined(MI_SHARED_LIB) - // Windows DLL: easy to hook into process_init and thread_done - __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { - MI_UNUSED(reserved); - MI_UNUSED(inst); - if (reason==DLL_PROCESS_ATTACH) { - mi_process_load(); - } - else if (reason==DLL_PROCESS_DETACH) { - mi_process_done(); - } - else if (reason==DLL_THREAD_DETACH) { - if (!mi_is_redirected()) { - mi_thread_done(); - } - } - return TRUE; - } - -#elif defined(_MSC_VER) - // MSVC: use data section magic for static libraries - // See - static int _mi_process_init(void) { - mi_process_load(); - return 0; - } - typedef int(*_mi_crt_callback_t)(void); - #if defined(_M_X64) || defined(_M_ARM64) - __pragma(comment(linker, "/include:" "_mi_msvc_initu")) - #pragma section(".CRT$XIU", long, read) - #else - __pragma(comment(linker, "/include:" "__mi_msvc_initu")) - #endif - #pragma data_seg(".CRT$XIU") - mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init }; - #pragma data_seg() - -#elif defined(__cplusplus) - // C++: use static initialization to detect process start - static bool _mi_process_init(void) { - mi_process_load(); - return (_mi_heap_main.thread_id != 0); - } - static bool mi_initialized = _mi_process_init(); - -#elif defined(__GNUC__) || defined(__clang__) - // GCC,Clang: use the constructor attribute - 
static void __attribute__((constructor)) _mi_process_init(void) { - mi_process_load(); - } - -#else -#pragma message("define a way to call mi_process_load on your platform") -#endif +void mi_cdecl _mi_auto_process_done(void) mi_attr_noexcept { + if (_mi_option_get_fast(mi_option_destroy_on_exit)>1) return; + mi_process_done(); +} diff --git a/system/lib/mimalloc/src/libc.c b/system/lib/mimalloc/src/libc.c index dd6b400737906..4c891c1e6521d 100644 --- a/system/lib/mimalloc/src/libc.c +++ b/system/lib/mimalloc/src/libc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file // -------------------------------------------------------- // This module defines various std libc functions to reduce -// the dependency on libc, and also prevent errors caused +// the dependency on libc, and also prevent errors caused // by some libc implementations when called before `main` // executes (due to malloc redirection) // -------------------------------------------------------- @@ -65,6 +65,21 @@ size_t _mi_strnlen(const char* s, size_t max_len) { return len; } +char* _mi_strnstr(char* s, size_t max_len, const char* pat) { + if (s==NULL) return NULL; + if (pat==NULL) return s; + const size_t m = _mi_strnlen(s, max_len); + const size_t n = _mi_strlen(pat); + for (size_t start = 0; start + n <= m; start++) { + size_t i = 0; + while (i 16) { if (prefix != 0) { mi_outc(prefix, out, end); } @@ -144,8 +159,8 @@ static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* mi_outc((digit <= 9 ? 
'0' + digit : 'A' + digit - 10),out,end); x = x / base; } - if (prefix != 0) { - mi_outc(prefix, out, end); + if (prefix != 0) { + mi_outc(prefix, out, end); } size_t len = *out - start; // and reverse in-place @@ -160,8 +175,8 @@ static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* #define MI_NEXTC() c = *in; if (c==0) break; in++; -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { - if (buf == NULL || bufsize == 0 || fmt == NULL) return; +int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { + if (buf == NULL || bufsize == 0 || fmt == NULL) return 0; buf[bufsize - 1] = 0; char* const end = buf + (bufsize - 1); const char* in = fmt; @@ -171,7 +186,18 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { char c; MI_NEXTC(); if (c != '%') { - if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only + if (c == '\\') { + MI_NEXTC(); + switch (c) { + case 'e': mi_outc('\x1B', &out, end); break; + case 't': mi_outc('\t', &out, end); break; + case 'n': mi_outc('\n', &out, end); break; + case 'r': mi_outc('\r', &out, end); break; + case '\\': mi_outc('\\', &out, end); break; + default: /* ignore */ break; + } + } + else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only mi_outc(c, &out, end); } } @@ -181,7 +207,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { size_t width = 0; char numtype = 'd'; char numplus = 0; - bool alignright = true; + bool alignright = true; if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); } @@ -191,7 +217,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { width = (10 * width) + (c - '0'); MI_NEXTC(); } if (c == 0) break; // extra check due to while - } + 
} if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } else if (c == 'l') { numtype = c; MI_NEXTC(); @@ -199,19 +225,23 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } char* start = out; - if (c == 's') { + if (c == '%') { + mi_outc('%', &out, end); + } + else if (c == 's') { // string const char* s = va_arg(args, const char*); mi_outs(s, &out, end); } else if (c == 'p' || c == 'x' || c == 'u') { // unsigned - uintptr_t x = 0; + uintmax_t x = 0; if (c == 'x' || c == 'u') { if (numtype == 'z') x = va_arg(args, size_t); else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t - else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long); - else x = va_arg(args, unsigned long); + else if (numtype == 'L') x = va_arg(args, unsigned long long); + else if (numtype == 'l') x = va_arg(args, unsigned long); + else x = va_arg(args, unsigned int); } else if (c == 'p') { x = va_arg(args, uintptr_t); @@ -228,20 +258,21 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } else if (c == 'i' || c == 'd') { // signed - intptr_t x = 0; + intmax_t x = 0; if (numtype == 'z') x = va_arg(args, intptr_t ); else if (numtype == 't') x = va_arg(args, ptrdiff_t); - else if (numtype == 'L') x = (intptr_t)va_arg(args, long long); - else x = va_arg(args, long); + else if (numtype == 'L') x = va_arg(args, long long); + else if (numtype == 'l') x = va_arg(args, long); + else x = va_arg(args, int); char pre = 0; if (x < 0) { pre = '-'; - if (x > INTPTR_MIN) { x = -x; } + if (x > INTMAX_MIN) { x = -x; } } else if (numplus != 0) { pre = numplus; } - mi_out_num((uintptr_t)x, 10, pre, &out, end); + mi_out_num((uintmax_t)x, 10, pre, &out, end); } else if (c >= ' ' && c <= '~') { // unknown format @@ -263,11 +294,139 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } mi_assert_internal(out <= end); *out = 0; + return (int)(out - buf); } -void _mi_snprintf(char* 
buf, size_t buflen, const char* fmt, ...) { +int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { va_list args; va_start(args, fmt); - _mi_vsnprintf(buf, buflen, fmt, args); + const int written = _mi_vsnprintf(buf, buflen, fmt, args); va_end(args); + return written; +} + + + +// -------------------------------------------------------- +// generic trailing and leading zero count, and popcount +// -------------------------------------------------------- + +#if !MI_HAS_FAST_BITSCAN + +static size_t mi_ctz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + if (x==0) return 32; + return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; +} + +static size_t mi_clz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, + 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 + }; + if (x==0) return 32; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; +} + +size_t _mi_ctz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_ctz_generic32((uint32_t)x); + #else + const uint32_t lo = (uint32_t)x; + if (lo != 0) { + return mi_ctz_generic32(lo); + } + else { + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + } + #endif } + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const uint32_t hi = (uint32_t)(x>>32); + if (hi != 0) { + return mi_clz_generic32(hi); + } + else { + return 32 + mi_clz_generic32((uint32_t)x); + } + #endif +} + +#endif // bit scan + + +#if MI_SIZE_SIZE == 4 +#define mi_mask_even_bits32 (0x55555555) +#define 
mi_mask_even_pairs32 (0x33333333) +#define mi_mask_even_nibbles32 (0x0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! +static size_t mi_byte_sum32(uint32_t x) { + // perform `x * 0x01010101`: the highest byte contains the sum of all bytes. + x += (x << 8); + x += (x << 16); + return (size_t)(x >> 24); +} + +static size_t mi_popcount_generic32(uint32_t x) { + // first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10 + // in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair + // into the lower bit-pair: + x = x - ((x >> 1) & mi_mask_even_bits32); + // add the 2-bit pair results + x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32); + // add the 4-bit nibble results + x = (x + (x >> 4)) & mi_mask_even_nibbles32; + // each byte now has a count of its bits, we can sum them now: + return mi_byte_sum32(x); +} + +mi_decl_noinline size_t _mi_popcount_generic(size_t x) { + if (x<=1) return x; + if (~x==0) return MI_SIZE_BITS; + return mi_popcount_generic32(x); +} + +#else +#define mi_mask_even_bits64 (0x5555555555555555) +#define mi_mask_even_pairs64 (0x3333333333333333) +#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! 
+static size_t mi_byte_sum64(uint64_t x) { + x += (x << 8); + x += (x << 16); + x += (x << 32); + return (size_t)(x >> 56); +} + +static size_t mi_popcount_generic64(uint64_t x) { + x = x - ((x >> 1) & mi_mask_even_bits64); + x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64); + x = (x + (x >> 4)) & mi_mask_even_nibbles64; + return mi_byte_sum64(x); +} + +mi_decl_noinline size_t _mi_popcount_generic(size_t x) { + if (x<=1) return x; + if (~x==0) return MI_SIZE_BITS; + return mi_popcount_generic64(x); +} +#endif + diff --git a/system/lib/mimalloc/src/options.c b/system/lib/mimalloc/src/options.c index a62727dd69fe3..4027ce20516ce 100644 --- a/system/lib/mimalloc/src/options.c +++ b/system/lib/mimalloc/src/options.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -12,8 +12,6 @@ terms of the MIT license. A copy of the license can be found in the file #include // stdin/stdout #include // abort - - static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit) static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit) @@ -30,100 +28,251 @@ int mi_version(void) mi_attr_noexcept { // concurrently initialized, but an initializing data race // is ok since they resolve to the same value. 
// -------------------------------------------------------- -typedef enum mi_init_e { - UNINIT, // not yet initialized - DEFAULTED, // not found in the environment, use default value - INITIALIZED // found in environment or set explicitly -} mi_init_t; - -typedef struct mi_option_desc_s { - long value; // the value - mi_init_t init; // is it initialized yet? (from the environment) - mi_option_t option; // for debugging: the option index should match the option - const char* name; // option name without `mimalloc_` prefix - const char* legacy_name; // potential legacy option name -} mi_option_desc_t; + #define MI_OPTION(opt) mi_option_##opt, #opt, NULL #define MI_OPTION_LEGACY(opt,legacy) mi_option_##opt, #opt, #legacy -static mi_option_desc_t options[_mi_option_last] = +// Some options can be set at build time for statically linked libraries +// (use `-DMI_EXTRA_CPPDEFS="opt1=val1;opt2=val2"`) +// +// This is useful if we cannot pass them as environment variables +// (and setting them programmatically would be too late) + +#ifndef MI_DEFAULT_VERBOSE +#define MI_DEFAULT_VERBOSE 0 +#endif + +#ifndef MI_DEFAULT_ARENA_EAGER_COMMIT +#define MI_DEFAULT_ARENA_EAGER_COMMIT 2 +#endif + +// in KiB +#ifndef MI_DEFAULT_ARENA_RESERVE + #if (MI_INTPTR_SIZE>4) + #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L + #else + #define MI_DEFAULT_ARENA_RESERVE 128L*1024L + #endif +#endif + +#ifndef MI_DEFAULT_ARENA_MAX_OBJECT_SIZE +#define MI_DEFAULT_ARENA_MAX_OBJECT_SIZE ((MI_SIZE_BITS * MI_ARENA_MAX_CHUNK_OBJ_SIZE)/MI_KiB) /* 2 GiB (or 256 MiB on 32-bit), larger than this is alloc'd by the OS */ +#endif + +#ifndef MI_DEFAULT_DISALLOW_ARENA_ALLOC +#define MI_DEFAULT_DISALLOW_ARENA_ALLOC 0 +#endif + +#ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES +#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0 +#endif + +#ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES +#define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0 +#endif + +#ifndef MI_DEFAULT_RESERVE_OS_MEMORY +#define MI_DEFAULT_RESERVE_OS_MEMORY 0 +#endif + +#ifndef 
MI_DEFAULT_GUARDED_SAMPLE_RATE +#if MI_GUARDED && !MI_DEBUG +#define MI_DEFAULT_GUARDED_SAMPLE_RATE 4000 +#else +#define MI_DEFAULT_GUARDED_SAMPLE_RATE 0 +#endif +#endif + +#ifndef MI_DEFAULT_PAGEMAP_COMMIT +#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access +#define MI_DEFAULT_PAGEMAP_COMMIT 1 +#else +#define MI_DEFAULT_PAGEMAP_COMMIT 0 +#endif +#endif + +#ifndef MI_DEFAULT_PAGE_MAX_RECLAIM +#define MI_DEFAULT_PAGE_MAX_RECLAIM (-1) // unlimited +#endif + +#ifndef MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM +#define MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM 32 +#endif + +#ifndef MI_DEFAULT_ALLOW_THP +#if defined(__ANDROID__) +#define MI_DEFAULT_ALLOW_THP 0 +#else +#define MI_DEFAULT_ALLOW_THP 1 +#endif +#endif + +// Static options +static mi_option_desc_t mi_options[_mi_option_last] = { // stable options - #if MI_DEBUG || defined(MI_SHOW_ERRORS) - { 1, UNINIT, MI_OPTION(show_errors) }, - #else - { 0, UNINIT, MI_OPTION(show_errors) }, - #endif - { 0, UNINIT, MI_OPTION(show_stats) }, - { 0, UNINIT, MI_OPTION(verbose) }, - - // the following options are experimental and not all combinations make sense. - { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) - { 2, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. 
linux) - { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) }, // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit) - { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's - { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages - {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N - { 0, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) - { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread - { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free - { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates - { 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit) -#if defined(__NetBSD__) - { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed +#if MI_DEBUG || defined(MI_SHOW_ERRORS) + { 1, MI_OPTION_UNINIT, MI_OPTION(show_errors) }, #else - { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + { 0, MI_OPTION_UNINIT, MI_OPTION(show_errors) }, #endif - { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds - { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
- { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) - { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose - { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output - { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. - { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! - #if (MI_INTPTR_SIZE>4) - { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) - #else - { 128L*1024L, UNINIT, MI_OPTION(arena_reserve) }, // =128MiB on 32-bit - #endif - { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's - { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) - { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. + { 0, MI_OPTION_UNINIT, MI_OPTION(show_stats) }, + { MI_DEFAULT_VERBOSE, MI_OPTION_UNINIT, MI_OPTION(verbose) }, + + // some of the following options are experimental and not all combinations are allowed. + { 1, MI_OPTION_UNINIT, MI_OPTION(deprecated_eager_commit) }, + { MI_DEFAULT_ARENA_EAGER_COMMIT, + MI_OPTION_UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. 
linux) + { 1, MI_OPTION_UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) }, // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit) + { MI_DEFAULT_ALLOW_LARGE_OS_PAGES, + MI_OPTION_UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's + { MI_DEFAULT_RESERVE_HUGE_OS_PAGES, + MI_OPTION_UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages + {-1, MI_OPTION_UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N + { MI_DEFAULT_RESERVE_OS_MEMORY, + MI_OPTION_UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) + { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread + { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free + { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_abandoned_page_purge) }, + { 0, MI_OPTION_UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit) + { 1, MI_OPTION_UNINIT, MI_OPTION(deprecated_eager_commit_delay) }, + { 1000,MI_OPTION_UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, MI_OPTION_UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. + { 0, MI_OPTION_UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) + { 100, MI_OPTION_UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose + { 32, MI_OPTION_UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output + { 32, MI_OPTION_UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output + { 10, MI_OPTION_UNINIT, MI_OPTION(deprecated_max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. 
+ { 0, MI_OPTION_UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! + { MI_DEFAULT_ARENA_RESERVE, MI_OPTION_UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) + { 1, MI_OPTION_UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { 1, MI_OPTION_UNINIT, MI_OPTION_LEGACY(deprecated_purge_extend_delay, decommit_extend_delay) }, + { MI_DEFAULT_DISALLOW_ARENA_ALLOC, MI_OPTION_UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) + { 400, MI_OPTION_UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. +#if defined(MI_VISIT_ABANDONED) + { 1, MI_OPTION_INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting theap blocks in abandoned segments; requires taking locks during reclaim. +#else + { 0, MI_OPTION_UNINIT, MI_OPTION(visit_abandoned) }, +#endif + { 0, MI_OPTION_UNINIT, MI_OPTION(guarded_min) }, // only used when building with MI_GUARDED: minimal rounded object size for guarded objects + { MI_GiB, MI_OPTION_UNINIT, MI_OPTION(guarded_max) }, // only used when building with MI_GUARDED: maximal rounded object size for guarded objects + { 0, MI_OPTION_UNINIT, MI_OPTION(guarded_precise) }, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) + { MI_DEFAULT_GUARDED_SAMPLE_RATE, + MI_OPTION_UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) + { 0, MI_OPTION_UNINIT, MI_OPTION(guarded_sample_seed)}, + { 10000, MI_OPTION_UNINIT, MI_OPTION(generic_collect) }, // collect theaps every N (=10000) generic allocation calls + { 0, MI_OPTION_UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim abandoned (small) pages on a free: -1 = 
disable completely, 0 = only reclaim into the originating theap, 1 = reclaim on free across theaps + { 2, MI_OPTION_UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues + { 4, MI_OPTION_UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate + { 0, MI_OPTION_UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits + { MI_DEFAULT_PAGEMAP_COMMIT, + MI_OPTION_UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? + { 0, MI_OPTION_UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux)) + { MI_DEFAULT_PAGE_MAX_RECLAIM, + MI_OPTION_UNINIT, MI_OPTION(page_max_reclaim) }, // don't reclaim (small) pages of the same originating theap if we already own N pages in that size class + { MI_DEFAULT_PAGE_CROSS_THREAD_MAX_RECLAIM, + MI_OPTION_UNINIT, MI_OPTION(page_cross_thread_max_reclaim) }, // don't reclaim (small) pages across threads if we already own N pages in that size class + { MI_DEFAULT_ALLOW_THP, + MI_OPTION_UNINIT, MI_OPTION(allow_thp) }, // allow transparent huge pages? (=1) (on Android =0 by default). Set to 0 to disable THP for the process. + { 0, MI_OPTION_UNINIT, MI_OPTION(minimal_purge_size) }, // set minimal purge size (in KiB) (=0). By default set to either 64 or 2048 if THP is enabled. + { MI_DEFAULT_ARENA_MAX_OBJECT_SIZE, + MI_OPTION_UNINIT, MI_OPTION(arena_max_object_size) }, // set maximal object size that can be allocated in an arena (in KiB) (=2GiB on 64-bit). 
}; static void mi_option_init(mi_option_desc_t* desc); static bool mi_option_has_size_in_kib(mi_option_t option) { - return (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve); + return (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve || + option == mi_option_minimal_purge_size || option == mi_option_arena_max_object_size); } void _mi_options_init(void) { - // called on process load; should not be called before the CRT is initialized! - // (e.g. do not call this from process_init as that may run before CRT initialization) - mi_add_stderr_output(); // now it safe to use stderr for output + // called on process load for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; long l = mi_option_get(option); MI_UNUSED(l); // initialize - // if (option != mi_option_verbose) - { - mi_option_desc_t* desc = &options[option]; - _mi_verbose_message("option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? 
"KiB" : "")); - } } mi_max_error_count = mi_option_get(mi_option_max_errors); mi_max_warning_count = mi_option_get(mi_option_max_warnings); + #if MI_GUARDED + if (mi_option_get(mi_option_guarded_sample_rate) > 0) { + if (mi_option_is_enabled(mi_option_allow_large_os_pages)) { + mi_option_disable(mi_option_allow_large_os_pages); + _mi_warning_message("option 'allow_large_os_pages' is disabled to allow for guarded objects\n"); + } + } + #endif +} + +// called at actual process load, it should be safe to print now +void _mi_options_post_init(void) { + mi_add_stderr_output(); // now it safe to use stderr for output + if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } +} + +#define mi_stringifyx(str) #str // and stringify +#define mi_stringify(str) mi_stringifyx(str) // expand + +mi_decl_export void mi_options_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept +{ + // show version + const int vermajor = MI_MALLOC_VERSION/10000; + const int verminor = (MI_MALLOC_VERSION%10000)/100; + const int verpatch = (MI_MALLOC_VERSION%100); + _mi_fprintf(out, arg, "v%i.%i.%i%s%s (built on %s, %s)\n", vermajor, verminor, verpatch, + #if defined(MI_CMAKE_BUILD_TYPE) + ", " mi_stringify(MI_CMAKE_BUILD_TYPE) + #else + "" + #endif + , + #if defined(MI_GIT_DESCRIBE) + ", git " mi_stringify(MI_GIT_DESCRIBE) + #else + "" + #endif + , __DATE__, __TIME__); + + // show options + for (int i = 0; i < _mi_option_last; i++) { + mi_option_t option = (mi_option_t)i; + long l = mi_option_get(option); MI_UNUSED(l); // possibly initialize + mi_option_desc_t* desc = &mi_options[option]; + _mi_fprintf(out, arg, "option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? 
"KiB" : "")); + } + + // show build configuration + _mi_fprintf(out, arg, "debug level : %d\n", MI_DEBUG ); + _mi_fprintf(out, arg, "secure level: %d\n", MI_SECURE ); + _mi_fprintf(out, arg, "mem tracking: %s\n", MI_TRACK_TOOL); + #if MI_GUARDED + _mi_fprintf(out, arg, "guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? "enabled" : "disabled"); + #endif + #if MI_TSAN + _mi_fprintf(out, arg, "thread santizer enabled\n"); + #endif +} + +mi_decl_export void mi_options_print(void) mi_attr_noexcept { + mi_options_print_out(NULL, NULL); } +long _mi_option_get_fast(mi_option_t option) { + mi_assert(option >= 0 && option < _mi_option_last); + mi_option_desc_t* desc = &mi_options[option]; + mi_assert(desc->option == option); // index should match the option + //mi_assert(desc->init != MI_OPTION_UNINIT); + return desc->value; +} + + mi_decl_nodiscard long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return 0; - mi_option_desc_t* desc = &options[option]; + mi_option_desc_t* desc = &mi_options[option]; mi_assert(desc->option == option); // index should match the option - if mi_unlikely(desc->init == UNINIT) { + if mi_unlikely(desc->init == MI_OPTION_UNINIT) { mi_option_init(desc); } return desc->value; @@ -135,11 +284,12 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma } mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { - mi_assert_internal(mi_option_has_size_in_kib(option)); const long x = mi_option_get(option); size_t size = (x < 0 ? 
0 : (size_t)x); if (mi_option_has_size_in_kib(option)) { - size *= MI_KiB; + if (mi_mul_overflow(size, MI_KiB, &size)) { + size = MI_MAX_ALLOC_SIZE; + } } return size; } @@ -147,17 +297,24 @@ mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { void mi_option_set(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return; - mi_option_desc_t* desc = &options[option]; + mi_option_desc_t* desc = &mi_options[option]; mi_assert(desc->option == option); // index should match the option desc->value = value; - desc->init = INITIALIZED; + desc->init = MI_OPTION_INITIALIZED; + // ensure min/max range; be careful to not recurse. + if (desc->option == mi_option_guarded_min && _mi_option_get_fast(mi_option_guarded_max) < value) { + mi_option_set(mi_option_guarded_max, value); + } + else if (desc->option == mi_option_guarded_max && _mi_option_get_fast(mi_option_guarded_min) > value) { + mi_option_set(mi_option_guarded_min, value); + } } void mi_option_set_default(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return; - mi_option_desc_t* desc = &options[option]; - if (desc->init != INITIALIZED) { + mi_option_desc_t* desc = &mi_options[option]; + if (desc->init != MI_OPTION_INITIALIZED) { desc->value = value; } } @@ -194,9 +351,9 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { // an output function is registered it is called immediately with // the output up to that point. 
#ifndef MI_MAX_DELAY_OUTPUT -#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024)) +#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024)) #endif -static char out_buf[MI_MAX_DELAY_OUTPUT+1]; +static char mi_output_buffer[MI_MAX_DELAY_OUTPUT+1]; static _Atomic(size_t) out_len; static void mi_cdecl mi_out_buf(const char* msg, void* arg) { @@ -212,7 +369,8 @@ static void mi_cdecl mi_out_buf(const char* msg, void* arg) { if (start+n >= MI_MAX_DELAY_OUTPUT) { n = MI_MAX_DELAY_OUTPUT-start-1; } - _mi_memcpy(&out_buf[start], msg, n); + mi_assert_internal(start + n <= MI_MAX_DELAY_OUTPUT); + _mi_memcpy(&mi_output_buffer[start], msg, n); } static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { @@ -221,10 +379,10 @@ static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); // and output the current contents if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; - out_buf[count] = 0; - out(out_buf,arg); + mi_output_buffer[count] = 0; + out(mi_output_buffer,arg); if (!no_more_buf) { - out_buf[count] = '\n'; // if continue with the buffer, insert a newline + mi_output_buffer[count] = '\n'; // if continue with the buffer, insert a newline } } @@ -262,8 +420,10 @@ void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept { // add stderr to the delayed output after the module is loaded static void mi_add_stderr_output(void) { mi_assert_internal(mi_out_default == NULL); - mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr - mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output + if (mi_out_default==NULL) { + mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr + mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output + } } // -------------------------------------------------------- @@ -280,7 +440,7 @@ static _Atomic(size_t) warning_count; 
// = 0; // when >= max_warning_count stop // (recursively) invoke malloc again to allocate space for the thread local // variables on demand. This is why we use a _mi_preloading test on such // platforms. However, C code generator may move the initial thread local address -// load before the `if` and we therefore split it out in a separate funcion. +// load before the `if` and we therefore split it out in a separate function. static mi_decl_thread bool recurse = false; static mi_decl_noinline bool mi_recurse_enter_prim(void) { @@ -294,14 +454,14 @@ static mi_decl_noinline void mi_recurse_exit_prim(void) { } static bool mi_recurse_enter(void) { - #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) + #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return false; #endif return mi_recurse_enter_prim(); } static void mi_recurse_exit(void) { - #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) + #if defined(__APPLE__) || defined(__ANDROID__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return; #endif mi_recurse_exit_prim(); @@ -324,7 +484,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[512]; + char buf[992]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); @@ -350,6 +510,20 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix } } +void _mi_raw_message(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + mi_vfprintf(NULL, NULL, NULL, fmt, args); + va_end(args); +} + +void _mi_message(const char* fmt, ...) 
{ + va_list args; + va_start(args, fmt); + mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args); + va_end(args); +} + void _mi_trace_message(const char* fmt, ...) { if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher va_list args; @@ -387,7 +561,7 @@ void _mi_warning_message(const char* fmt, ...) { #if MI_DEBUG -void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) { +mi_decl_noreturn mi_decl_cold void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) mi_attr_noexcept { _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion); abort(); } @@ -475,17 +649,17 @@ static void mi_option_init(mi_option_desc_t* desc) { buf[len] = 0; if (buf[0] == 0 || strstr("1;TRUE;YES;ON", buf) != NULL) { desc->value = 1; - desc->init = INITIALIZED; + desc->init = MI_OPTION_INITIALIZED; } else if (strstr("0;FALSE;NO;OFF", buf) != NULL) { desc->value = 0; - desc->init = INITIALIZED; + desc->init = MI_OPTION_INITIALIZED; } else { char* end = buf; long value = strtol(buf, &end, 10); if (mi_option_has_size_in_kib(desc->option)) { - // this option is interpreted in KiB to prevent overflow of `long` for large allocations + // this option is interpreted in KiB to prevent overflow of `long` for large allocations // (long is 32-bit on 64-bit windows, which allows for 4TiB max.) size_t size = (value < 0 ? 0 : (size_t)value); bool overflow = false; @@ -496,16 +670,15 @@ static void mi_option_init(mi_option_desc_t* desc) { else { size = (size + MI_KiB - 1) / MI_KiB; } if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB else if (*end == 'B') { end++; } // Kb, Mb, Gb, Tb - if (overflow || size > MI_MAX_ALLOC_SIZE) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); } + if (overflow || size > (MI_MAX_ALLOC_SIZE / MI_KiB)) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); } value = (size > LONG_MAX ? 
LONG_MAX : (long)size); } if (*end == 0) { - desc->value = value; - desc->init = INITIALIZED; + mi_option_set(desc->option, value); } else { // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose. - desc->init = DEFAULTED; + desc->init = MI_OPTION_DEFAULTED; if (desc->option == mi_option_verbose && desc->value == 0) { // if the 'mimalloc_verbose' env var has a bogus value we'd never know // (since the value defaults to 'off') so in that case briefly enable verbose @@ -518,9 +691,9 @@ static void mi_option_init(mi_option_desc_t* desc) { } } } - mi_assert_internal(desc->init != UNINIT); + mi_assert_internal(desc->init != MI_OPTION_UNINIT); } else if (!_mi_preloading()) { - desc->init = DEFAULTED; + desc->init = MI_OPTION_DEFAULTED; } } diff --git a/system/lib/mimalloc/src/os.c b/system/lib/mimalloc/src/os.c index ce104273bfdb0..f5b3d316b62d5 100644 --- a/system/lib/mimalloc/src/os.c +++ b/system/lib/mimalloc/src/os.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -9,18 +9,27 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" - /* ----------------------------------------------------------- - Initialization. + Initialization. 
----------------------------------------------------------- */ +#ifndef MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB +#if MI_INTPTR_SIZE < 8 +#define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB 4*MI_MiB // 4 GiB +#else +#define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB 32*MI_MiB // 32 GiB +#endif +#endif static mi_os_mem_config_t mi_os_mem_config = { - 4096, // page size - 0, // large page size (usually 2MiB) - 4096, // allocation granularity - true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) - false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) - true // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) + 4096, // page size + 0, // large page size (usually 2MiB) + 4096, // allocation granularity + MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB, + MI_MAX_VABITS, // in `bits.h` + true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) + false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) + true, // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory) + false // has transparent huge pages? (if true we purge in (aligned) large page size chunks only to not fragment such pages) }; bool _mi_os_has_overcommit(void) { @@ -42,9 +51,35 @@ size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } -bool _mi_os_use_large_page(size_t size, size_t alignment) { +// minimal purge size. Can be larger than the page size if transparent huge pages are enabled. 
+size_t _mi_os_minimal_purge_size(void) { + size_t minsize = mi_option_get_size(mi_option_minimal_purge_size); + if (minsize != 0) { + return _mi_align_up(minsize, _mi_os_page_size()); + } + else if (mi_os_mem_config.has_transparent_huge_pages && mi_option_is_enabled(mi_option_allow_thp)) { + return _mi_os_large_page_size(); + } + else { + return _mi_os_page_size(); + } +} + +size_t _mi_os_guard_page_size(void) { + const size_t gsize = _mi_os_page_size(); + mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/4)); // issue #1166 + return gsize; +} + +size_t _mi_os_virtual_address_bits(void) { + const size_t vbits = mi_os_mem_config.virtual_address_bits; + mi_assert(vbits <= MI_MAX_VABITS); + return vbits; +} + +bool _mi_os_canuse_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements - if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; + if (mi_os_mem_config.large_page_size == 0) return false; return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0); } @@ -68,98 +103,173 @@ void _mi_os_init(void) { /* ----------------------------------------------------------- Util -------------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero); +// On systems with enough virtual address bits, we can do efficient aligned allocation by using +// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address +// space (64TiB) we use this technique. 
(but see issue #939) +#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT) // && !defined(WIN32) && !defined(ANDROID) -/* ----------------------------------------------------------- - aligned hinting --------------------------------------------------------------- */ - -// On 64-bit systems, we can do efficient aligned allocation by using -// the 2TiB to 30TiB area to allocate those. -#if (MI_INTPTR_SIZE >= 8) -static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; - -// Return a MI_SEGMENT_SIZE aligned address that is probably available. +// Return a MI_HINT_ALIGN (4MiB) aligned address that is probably available. // If this returns NULL, the OS will determine the address but on some OS's that may not be // properly aligned which can be more costly as it needs to be adjusted afterwards. -// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; +// For a size > 16GiB this always returns NULL in order to guarantee good ASLR randomization; // (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses // in the middle of the 2TiB - 6TiB address range (see issue #372)) -#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start -#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) -#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) +#define MI_HINT_ALIGN ((uintptr_t)4 << 20) // 4MiB alignment +#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start +#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto (2+4) 6TiB (since before win8 there is "only" 8TiB available to processes) +#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { - if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; - size = _mi_align_up(size, MI_SEGMENT_SIZE); - if (size 
> 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. - #if (MI_SECURE>0) - size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. - #endif - + static mi_decl_cache_align _Atomic(uintptr_t) aligned_base; // = 0 + + // todo: perhaps only do alignment hints if THP is enabled? + if (try_alignment <= mi_os_mem_config.alloc_granularity || try_alignment > MI_HINT_ALIGN) return NULL; + if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space + size = _mi_align_up(size, MI_HINT_ALIGN); + if (size > 16*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at least 1/(MI_HINT_AREA / 1<<34) + size += MI_HINT_ALIGN; // put in virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. + uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize uintptr_t init = MI_HINT_BASE; - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); - init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB + #if (MI_SECURE>=1 || defined(NDEBUG)) // security: randomize start of aligned allocations unless in debug mode + const uintptr_t r = _mi_theap_random_next(mi_theap_get_default()); + init = init + ((MI_HINT_ALIGN * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif uintptr_t expected = hint + size; mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all } + mi_assert_internal(hint%MI_HINT_ALIGN == 0); if (hint%try_alignment != 0) return NULL; return (void*)hint; } #else 
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { - MI_UNUSED(try_alignment); MI_UNUSED(size); - return NULL; + MI_UNUSED(try_alignment); MI_UNUSED(size); + return NULL; } #endif + + +/* ----------------------------------------------------------- + Guard page allocation +----------------------------------------------------------- */ + +// In secure mode, return the size of a guard page, otherwise 0 +size_t _mi_os_secure_guard_page_size(void) { + #if MI_SECURE > 0 + return _mi_os_guard_page_size(); + #else + return 0; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. +bool _mi_os_secure_guard_page_set_at(void* addr, mi_memid_t memid) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + bool ok = false; + if (!memid.is_pinned) { + mi_arena_t* const arena = mi_memid_arena(memid); + if (arena != NULL && arena->commit_fun != NULL) { + ok = (*(arena->commit_fun))(false /* decommit */, addr, _mi_os_secure_guard_page_size(), NULL, arena->commit_fun_arg); + } + else { + ok = _mi_os_decommit(addr, _mi_os_secure_guard_page_size()); + } + } + if (!ok) { + _mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size()); + } + return ok; + #else + MI_UNUSED(memid); + return true; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. 
+bool _mi_os_secure_guard_page_set_before(void* addr, mi_memid_t memid) { + return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), memid); +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_at(void* addr, mi_memid_t memid) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + if (!memid.is_pinned) { + mi_arena_t* const arena = mi_memid_arena(memid); + if (arena != NULL && arena->commit_fun != NULL) { + return (*(arena->commit_fun))(true, addr, _mi_os_secure_guard_page_size(), NULL, arena->commit_fun_arg); + } + else { + return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL); + } + } + #else + MI_UNUSED(memid); + #endif + return true; +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_before(void* addr, mi_memid_t memid) { + return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), memid); +} /* ----------------------------------------------------------- Free memory -------------------------------------------------------------- */ -static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); +static void mi_os_free_huge_os_pages(void* p, size_t size, mi_subproc_t* subproc); -static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static void mi_os_prim_free(void* addr, size_t size, size_t commit_size, mi_subproc_t* subproc) { mi_assert_internal((size % _mi_os_page_size()) == 0); - if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) - int err = _mi_prim_free(addr, size); + if (addr == NULL) return; // || _mi_os_is_huge_reserved(addr) + int err = _mi_prim_free(addr, size); // allow size==0 (issue #1041) if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { 
_mi_stat_decrease(&stats->committed, size); } - _mi_stat_decrease(&stats->reserved, size); + if (subproc == NULL) { subproc = _mi_subproc(); } // from `mi_arenas_unsafe_destroy` we pass subproc_main explicitly as we can no longer use the theap pointer + if (commit_size > 0) { + mi_subproc_stat_decrease(subproc, committed, commit_size); + } + mi_subproc_stat_decrease(subproc, reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_subproc_t* subproc /* can be NULL */) { if (mi_memkind_is_os(memid.memkind)) { - size_t csize = _mi_os_good_alloc_size(size); + size_t csize = memid.mem.os.size; + if (csize==0) { csize = _mi_os_good_alloc_size(size); } + mi_assert_internal(csize >= size); + size_t commit_size = (still_committed ? csize : 0); void* base = addr; // different base? (due to alignment) - if (memid.mem.os.base != NULL) { + if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); - mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); base = memid.mem.os.base; - csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); + const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base; + if (memid.mem.os.size==0) { + csize += diff; + } + if (still_committed) { + commit_size -= diff; // the (addr-base) part was already un-committed + } } // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, tld_stats); + mi_os_free_huge_os_pages(base, csize, subproc); } else { - mi_os_prim_free(base, csize, still_committed, tld_stats); + mi_os_prim_free(base, csize, (still_committed ? 
commit_size : 0), subproc); } } else { @@ -168,8 +278,8 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { - _mi_os_free_ex(p, size, true, memid, tld_stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid) { + _mi_os_free_ex(p, size, true, memid, NULL); } @@ -178,7 +288,8 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) -------------------------------------------------------------- */ // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { +// Also `hint_addr` is a hint and may be ignored. +static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); @@ -187,18 +298,16 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning *is_zero = false; void* p = NULL; - int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); + int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { - _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); + _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, 
allow_large); } - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - mi_stat_counter_increase(stats->mmap_calls, 1); + mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&stats->reserved, size); + mi_os_stat_increase(reserved, size); if (commit) { - _mi_stat_increase(&stats->committed, size); + mi_os_stat_increase(committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -209,10 +318,14 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo return p; } +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { + return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero); +} + // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. -static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_large != NULL); @@ -222,50 +335,61 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try first with a hint (this will be aligned directly on Win 10+ or BSD) - void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); - if (p == NULL) return NULL; + // try a direct allocation if the alignment is below the default, or less than or equal 
to 1/4 fraction of the size. + const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment <= size/4); + + void* p = NULL; + if (try_direct_alloc) { + p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); + } // aligned already? - if (((uintptr_t)p % alignment) == 0) { + if (p != NULL && ((uintptr_t)p % alignment) == 0) { *base = p; } else { // if not aligned, free it, overallocate, and unmap around it - _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); - mi_os_prim_free(p, size, commit, stats); + #if !MI_TRACK_ASAN + if (try_direct_alloc) { + _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + } + #endif + if (p != NULL) { mi_os_prim_free(p, size, (commit ? size : 0), NULL); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory - p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region - // note: this is dangerous on Windows as VirtualFree needs the actual base pointer - // this is handled though by having the `base` field in the memid's + // note: on Windows VirtualFree needs the actual base pointer + // this is handledby having the `base` field in the memid. 
*base = p; // remember the base - p = mi_align_up_ptr(p, alignment); + p = _mi_align_up_ptr(p, alignment); // explicitly commit only the aligned part if (commit) { - _mi_os_commit(p, size, NULL, stats); + if (!_mi_os_commit(p, size, NULL)) { + mi_os_prim_free(*base, over_size, 0, NULL); + return NULL; + } } } else { // mmap can free inside an allocation // overallocate... - p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); if (p == NULL) return NULL; - // and selectively unmap parts around the over-allocated area. - void* aligned_p = mi_align_up_ptr(p, alignment); + // and selectively unmap parts around the over-allocated area. + void* aligned_p = _mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); - if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } - if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } + if (pre_size > 0) { mi_os_prim_free(p, pre_size, (commit ? pre_size : 0), NULL); } + if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, (commit ? post_size : 0), NULL); } // we can return the aligned pointer on `mmap` systems p = aligned_p; *base = aligned_p; // since we freed the pre part, `*base == p`. 
@@ -281,20 +405,22 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; - void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); - if (p != NULL) { - *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); - } + void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); + if (p == NULL) return NULL; + + *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); + mi_assert_internal(memid->mem.os.size >= size); + mi_assert_internal(memid->initially_committed); return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); @@ -305,15 +431,43 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); - if (p != NULL) { - *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); - memid->mem.os.base = os_base; - memid->mem.os.alignment = alignment; + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); + if (p == NULL) return NULL; + + *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); + memid->mem.os.base = os_base; + memid->mem.os.size += 
((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned? + + mi_assert_internal(memid->mem.os.size >= size); + mi_assert_internal(_mi_is_aligned(p,alignment)); + if (commit) { mi_assert_internal(memid->initially_committed); } + return p; +} + + +mi_decl_nodiscard static void* mi_os_ensure_zero(void* p, size_t size, mi_memid_t* memid) { + if (p==NULL || size==0) return p; + // ensure committed + if (!memid->initially_committed) { + bool is_zero = false; + if (!_mi_os_commit(p, size, &is_zero)) { + _mi_os_free(p, size, *memid); + return NULL; + } + memid->initially_committed = true; } + // ensure zero'd + if (memid->initially_zero) return p; + _mi_memzero_aligned(p,size); + memid->initially_zero = true; return p; } +void* _mi_os_zalloc(size_t size, mi_memid_t* memid) { + void* p = _mi_os_alloc(size,memid); + return mi_os_ensure_zero(p, size, memid); +} + /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc @@ -322,28 +476,26 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo to use the actual start of the memory region. 
----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { - mi_assert(offset <= MI_SEGMENT_SIZE); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) { mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); - if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, stats); + _mi_os_decommit(start, extra); } return p; } @@ -360,11 +512,11 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, if (newsize != NULL) *newsize = 0; if (size == 0 || addr == NULL) return NULL; - // page align conservatively within the range - void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) - : mi_align_down_ptr(addr, _mi_os_page_size())); - void* end = (conservative ? 
mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) - : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); + // page align conservatively within the range, or liberally straddling pages outside the range + void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size()) + : _mi_align_down_ptr(addr, _mi_os_page_size())); + void* end = (conservative ? _mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) + : _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; if (diff <= 0) return NULL; @@ -377,12 +529,9 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&stats->commit_calls, 1); + mi_os_stat_counter_increase(commit_calls, 1); // page align range size_t csize; @@ -405,14 +554,17 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats if (os_is_zero) { mi_track_mem_defined(start,csize); } else { mi_track_mem_undefined(start,csize); } #endif + mi_os_stat_increase(committed, stat_size); // use size for precise commit vs. 
decommit return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { + return _mi_os_commit_ex(addr, size, is_zero, size); +} + +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stat_size) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&stats->committed, size); + mi_os_stat_decrease(committed, stat_size); // page align size_t csize; @@ -429,9 +581,9 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ return (err == 0); } -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { +bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); + return mi_os_decommit_ex(addr, size, &needs_recommit, size); } @@ -439,13 +591,13 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&stats->reset, csize); - _mi_stat_counter_increase(&stats->reset_calls, 1); + mi_os_stat_counter_increase(reset, csize); + mi_os_stat_counter_increase(reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -459,24 +611,39 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { } +void _mi_os_reuse( void* addr, size_t size ) { + // page align conservatively within the range + size_t csize = 0; + void* const start = mi_os_page_align_area_conservative(addr, size, &csize); + if (csize == 0) return; + const int err = _mi_prim_reuse(start, csize); + if (err != 0) { + _mi_warning_message("cannot reuse OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); + } +} + // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size, mi_commit_fun_t* commit_fun, void* commit_fun_arg) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&stats->purge_calls, 1); - _mi_stat_increase(&stats->purged, size); + mi_os_stat_counter_increase(purge_calls, 1); + mi_os_stat_counter_increase(purged, size); - if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? 
- !_mi_preloading()) // don't decommit during preloading (unsafe) + if (commit_fun != NULL) { + bool decommitted = (*commit_fun)(false, p, size, NULL, commit_fun_arg); + return decommitted; // needs_recommit? + } + else if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? + !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit, stats); + mi_os_decommit_ex(p, size, &needs_recommit, stat_size); return needs_recommit; } else { - if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed - _mi_os_reset(p, size, stats); + if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory) + _mi_os_reset(p, size); } return false; // needs no recommit } @@ -484,10 +651,11 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { - return _mi_os_purge_ex(p, size, true, stats); +bool _mi_os_purge(void* p, size_t size) { + return _mi_os_purge_ex(p, size, true, size, NULL, NULL); } + // Protect a region in memory to be not accessible. 
static bool mi_os_protectx(void* addr, size_t size, bool protect) { // page align conservatively within the range @@ -539,15 +707,14 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area - start = ((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); + start = ((uintptr_t)8 << 40); // 8TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_theap_random_next(_mi_theap_default()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB #endif } end = start + size; - mi_assert_internal(end % MI_SEGMENT_SIZE == 0); - } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); + } while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; @@ -560,13 +727,13 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } #endif -// Allocate MI_SEGMENT_SIZE aligned huge pages +// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { *memid = _mi_memid_none(); if (psize != NULL) *psize = 0; if (pages_reserved != NULL) *pages_reserved = 0; size_t size = 0; - uint8_t* start = mi_os_claim_huge_pages(pages, &size); + uint8_t* const start = mi_os_claim_huge_pages(pages, &size); if (start == NULL) return NULL; // or 32-bit systems // Allocate one page at the time but try to place them contiguously @@ -592,15 +759,15 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // no 
success, issue a warning and break if (p != NULL) { _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); - mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); + mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE, NULL); } break; } // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { @@ -622,7 +789,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (page != 0) { mi_assert(start != NULL); - *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); + *memid = _mi_memid_create_os(start, size, true /* is committed */, all_zero, true /* is_large */); memid->memkind = MI_MEM_OS_HUGE; mi_assert(memid->is_pinned); #ifdef MI_TRACK_ASAN @@ -634,45 +801,103 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. 
-static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { +static void mi_os_free_huge_os_pages(void* p, size_t size, mi_subproc_t* subproc) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { - mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); + mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE, subproc); size -= MI_HUGE_OS_PAGE_SIZE; base += MI_HUGE_OS_PAGE_SIZE; } } + /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ -_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count +static _Atomic(size_t) mi_numa_node_count; // = 0 // cache the node count -size_t _mi_os_numa_node_count_get(void) { - size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); - if (count <= 0) { +int _mi_os_numa_node_count(void) { + size_t count = mi_atomic_load_acquire(&mi_numa_node_count); + if mi_unlikely(count == 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? 
- if (ncount > 0) { + if (ncount > 0 && ncount < INT_MAX) { count = (size_t)ncount; } else { - count = _mi_prim_numa_node_count(); // or detect dynamically - if (count == 0) count = 1; + const size_t n = _mi_prim_numa_node_count(); // or detect dynamically + if (n == 0 || n > INT_MAX) { count = 1; } + else { count = n; } } - mi_atomic_store_release(&_mi_numa_node_count, count); // save it - _mi_verbose_message("using %zd numa regions\n", count); + mi_atomic_store_release(&mi_numa_node_count, count); // save it + if (count>1) { _mi_verbose_message("using %zd numa regions\n", count); } } - return count; + mi_assert_internal(count > 0 && count <= INT_MAX); + return (int)count; } -int _mi_os_numa_node_get(mi_os_tld_t* tld) { - MI_UNUSED(tld); - size_t numa_count = _mi_os_numa_node_count(); +static int mi_os_numa_node_get(void) { + int numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - size_t numa_node = _mi_prim_numa_node(); + const size_t n = _mi_prim_numa_node(); + int numa_node = (n < INT_MAX ? 
(int)n : 0); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } - return (int)numa_node; + return numa_node; +} + +int _mi_os_numa_node(void) { + if mi_likely(mi_atomic_load_relaxed(&mi_numa_node_count) == 1) { + return 0; + } + else { + return mi_os_numa_node_get(); + } +} + + +/* ---------------------------------------------------------------------------- + Public API +-----------------------------------------------------------------------------*/ +#if 0 +mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) { + return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size); +} + +static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + mi_memid_t memid = _mi_memid_none(); + void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid); + if (p == NULL) return p; + if (is_committed != NULL) { *is_committed = memid.initially_committed; } + if (is_pinned != NULL) { *is_pinned = memid.is_pinned; } + if (base != NULL) { *base = memid.mem.os.base; } + if (full_size != NULL) { *full_size = memid.mem.os.size; } + if (!memid.initially_zero && memid.initially_committed) { + _mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size); + } + return p; +} + +mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size); +} + +mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size); } + +mi_decl_export void mi_os_free(void* p, size_t size) { + if (p==NULL || size == 0) return; + mi_memid_t memid = _mi_memid_create_os(p, 
size, true, false, false); + _mi_os_free(p, size, memid); +} + +mi_decl_export void mi_os_commit(void* p, size_t size) { + _mi_os_commit(p, size, NULL); +} + +mi_decl_export void mi_os_decommit(void* p, size_t size) { + _mi_os_decommit(p, size); +} +#endif diff --git a/system/lib/mimalloc/src/page-map.c b/system/lib/mimalloc/src/page-map.c new file mode 100644 index 0000000000000..fab20cc0eaa54 --- /dev/null +++ b/system/lib/mimalloc/src/page-map.c @@ -0,0 +1,441 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2023-2025, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +static void mi_page_map_cannot_commit(void) { + _mi_warning_message("unable to commit the allocation page-map on-demand\n" ); +} + +#if MI_PAGE_MAP_FLAT + +// The page-map contains a byte for each 64kb slice in the address space. +// For an address `a` where `ofs = _mi_page_map[a >> 16]`: +// 0 = unused +// 1 = the slice at `a & ~0xFFFF` is a mimalloc page. +// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - ofs - 1) << 16)`. +// +// 1 byte per slice => 1 TiB address space needs a 2^14 * 2^16 = 16 MiB page map. +// A full 256 TiB address space (48 bit) needs a 4 GiB page map. +// A full 4 GiB address space (32 bit) needs only a 64 KiB page map. 
+ +mi_decl_cache_align uint8_t* _mi_page_map = NULL; +static void* mi_page_map_max_address = NULL; +static mi_memid_t mi_page_map_memid; + +#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE +static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries + +mi_decl_nodiscard static bool mi_page_map_ensure_committed(size_t idx, size_t slice_count); + +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB + if (vbits >= 48) { vbits = 47; } + #endif + } + + // Allocate the page map and commit bits + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); + const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); + const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t bitmap_size = (commit ? 
0 : mi_bitmap_size(commit_bits, NULL)); + const size_t reserve_size = bitmap_size + page_map_size; + uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (base==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(base, reserve_size); + } + if (bitmap_size > 0) { + mi_page_map_commit = (mi_bitmap_t*)base; + if (!_mi_os_commit(mi_page_map_commit, bitmap_size, NULL)) { + mi_page_map_cannot_commit(); + return false; + } + mi_bitmap_init(mi_page_map_commit, commit_bits, true); + } + _mi_page_map = base + bitmap_size; + + // commit the first part so NULL pointers get resolved without an access violation + if (!commit) { + if (!mi_page_map_ensure_committed(0, 1)) { + mi_page_map_cannot_commit(); + return false; + } + } + _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + +void _mi_page_map_unsafe_destroy(mi_subproc_t* subproc) { + mi_assert_internal(subproc != NULL); + mi_assert_internal(_mi_page_map != NULL); + if (_mi_page_map == NULL) return; + _mi_os_free_ex(mi_page_map_memid.mem.os.base, mi_page_map_memid.mem.os.size, true, mi_page_map_memid, subproc); + _mi_page_map = NULL; + mi_page_map_commit = NULL; + mi_page_map_max_address = NULL; + mi_page_map_memid = _mi_memid_none(); +} + + +static bool mi_page_map_ensure_committed(size_t idx, size_t slice_count) { + // is the page map area that contains the page address committed? + // we always set the commit bits so we can track what ranges are in-use. + // we only actually commit if the map wasn't committed fully already. 
+ if (mi_page_map_commit != NULL) { + const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_clear(mi_page_map_commit, i)) { + // this may race, in which case we do multiple commits (which is ok) + bool is_zero; + uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + if (!_mi_os_commit(start, size, &is_zero)) { + mi_page_map_cannot_commit(); + return false; + } + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } + mi_bitmap_set(mi_page_map_commit, i); + } + } + } + #if MI_DEBUG > 0 + _mi_page_map[idx] = 0; + _mi_page_map[idx+slice_count-1] = 0; + #endif + return true; +} + + +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { + size_t page_size; + *page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + ((*page_start - mi_page_slice_start(page))/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page); +} + +bool _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! 
+ if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return false; + } + mi_assert(_mi_page_map!=NULL); + uint8_t* page_start; + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); + + if (!mi_page_map_ensure_committed(idx, slice_count)) { + return false; + } + + // set the offsets + for (size_t i = 0; i < slice_count; i++) { + mi_assert_internal(i < 128); + _mi_page_map[idx + i] = (uint8_t)(i+1); + } + return true; +} + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + // get index and count + uint8_t* page_start; + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); + // unset the offsets + _mi_memzero(_mi_page_map + idx, slice_count); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + const uintptr_t index = _mi_page_map_index(start); + // todo: scan the commit bits and clear only those ranges? 
+ if (!mi_page_map_ensure_committed(index, slice_count)) { // we commit the range in total; + return; + } + _mi_memzero(&_mi_page_map[index], slice_count); +} + + +mi_page_t* _mi_safe_ptr_page(const void* p) { + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + const uintptr_t idx = _mi_page_map_index(p); + if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; + const uintptr_t ofs = _mi_page_map[idx]; + if mi_unlikely(ofs == 0) return NULL; + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT); +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + return (_mi_safe_ptr_page(p) != NULL); +} + +#else + +// A 2-level page map +#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*)) +#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT < MI_BFIELD_BITS ? 1 : (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS)) + +mi_decl_cache_align _Atomic(mi_submap_t)* _mi_page_map; +static size_t mi_page_map_count; +static void* mi_page_map_max_address; +static mi_memid_t mi_page_map_memid; +static mi_lock_t mi_page_map_lock; + +// divide the main map in 64 (`MI_BFIELD_BITS`) parts commit those parts on demand +static _Atomic(mi_bfield_t) mi_page_map_commit; + +mi_decl_nodiscard static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { + mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit); + const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT; + mi_assert_internal(bit_idx < MI_BFIELD_BITS); + if (pbit_idx != NULL) { *pbit_idx = bit_idx; } + return ((commit & (MI_ZU(1) << bit_idx)) != 0); +} + +mi_decl_nodiscard static bool mi_page_map_ensure_committed(size_t idx, mi_submap_t* submap) { + mi_assert_internal(submap!=NULL && *submap==NULL); + size_t bit_idx; + if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) { + uint8_t* start = 
(uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT]; + if (!_mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_submap_t), NULL)) { + mi_page_map_cannot_commit(); + return false; + } + mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx); + } + *submap = mi_atomic_load_ptr_acquire(mi_page_t*, &_mi_page_map[idx]); // acquire _mi_page_map_at(idx); + return true; +} + +// initialize the page map +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB + if (vbits >= 48) { vbits = 47; } + #endif + } + + // Allocate the page map and commit bits + mi_assert(MI_MAX_VABITS >= vbits); + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); + mi_page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); + mi_assert(mi_page_map_count <= MI_PAGE_MAP_COUNT); + const size_t os_page_size = _mi_os_page_size(); + const size_t page_map_size = _mi_align_up( mi_page_map_count * sizeof(mi_page_t**), os_page_size); + const size_t submap_size = MI_PAGE_MAP_SUB_SIZE; + const size_t reserve_size = page_map_size + submap_size; + #if MI_SECURE + const bool commit = true; // the whole page map is valid and we can reliably check any pointer + #else + const bool commit = page_map_size <= 64*MI_KiB || + mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit(); + #endif + _mi_page_map = (_Atomic(mi_page_t**)*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + 
_mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, page_map_size); + } + mi_atomic_store_release(&mi_page_map_commit, (mi_page_map_memid.initially_committed ? ~MI_ZU(0) : MI_ZU(0))); + + // ensure there is a submap for the NULL address + mi_page_t** const sub0 = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved a submap part at the end already + if (!mi_page_map_memid.initially_committed) { + if (!_mi_os_commit(sub0, submap_size, NULL)) { // commit full submap (issue #1087) + mi_page_map_cannot_commit(); + return false; + } + } + if (!mi_page_map_memid.initially_zero) { // initialize low addresses with NULL + _mi_memzero_aligned(sub0, submap_size); + } + mi_submap_t nullsub = NULL; + if (!mi_page_map_ensure_committed(0,&nullsub)) { + mi_page_map_cannot_commit(); + return false; + } + mi_atomic_store_ptr_release(mi_page_t*, &_mi_page_map[0], sub0); + mi_lock_init(&mi_page_map_lock); // initialize late in case the lock init causes allocation + + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + + +void _mi_page_map_unsafe_destroy(mi_subproc_t* subproc) { + mi_assert_internal(subproc != NULL); + mi_assert_internal(_mi_page_map != NULL); + if (_mi_page_map == NULL) return; + mi_lock_done(&mi_page_map_lock); + for (size_t idx = 1; idx < mi_page_map_count; idx++) { // skip entry 0 (as we allocate that submap at the end of the page_map) + // free all sub-maps + if (mi_page_map_is_committed(idx, NULL)) { + mi_submap_t sub = _mi_page_map_at(idx); + if (sub != NULL) { + mi_memid_t memid = _mi_memid_create_os(sub, MI_PAGE_MAP_SUB_SIZE, true, false, false); + _mi_os_free_ex(memid.mem.os.base, memid.mem.os.size, true, memid, subproc); + mi_atomic_store_ptr_release(mi_page_t*, &_mi_page_map[idx], NULL); + } + } + } + _mi_os_free_ex(_mi_page_map, mi_page_map_memid.mem.os.size, true, mi_page_map_memid, subproc); + _mi_page_map = NULL; + mi_page_map_count = 0; + 
mi_page_map_memid = _mi_memid_none(); + mi_page_map_max_address = NULL; + mi_atomic_store_release(&mi_page_map_commit, (mi_bfield_t)0); +} + + +mi_decl_nodiscard static bool mi_page_map_ensure_submap_at(size_t idx, mi_submap_t* submap) { + mi_assert_internal(submap!=NULL && *submap==NULL); + mi_submap_t sub = NULL; + if (!mi_page_map_ensure_committed(idx, &sub)) { + return false; + } + if mi_unlikely(sub == NULL) { + // sub map not yet allocated, alloc now + mi_lock(&mi_page_map_lock) + { + sub = mi_atomic_load_ptr_acquire(mi_page_t*, &_mi_page_map[idx]); // reload + if (sub==NULL) // not yet allocated by another thread? + { + mi_memid_t memid; + const size_t submap_size = MI_PAGE_MAP_SUB_SIZE; + sub = (mi_submap_t)_mi_os_zalloc(submap_size, &memid); + if (sub==NULL) { + _mi_warning_message("internal error: unable to extend the page map\n"); + } + else { + mi_submap_t expect = NULL; + if (!mi_atomic_cas_ptr_strong_acq_rel(mi_page_t*, &_mi_page_map[idx], &expect, sub)) { + // another thread already allocated it.. free and continue + _mi_os_free(sub, submap_size, memid); + sub = expect; + } + } + } + } + if (sub==NULL) return false; // unable to allocate the submap.. + } + mi_assert_internal(sub!=NULL); + *submap = sub; + return true; +} + +static bool mi_page_map_set_range_prim(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) { + // is the page map area that contains the page address committed? 
+ while (slice_count > 0) { + mi_submap_t sub = NULL; + if (!mi_page_map_ensure_submap_at(idx, &sub)) { + return false; + }; + mi_assert_internal(sub!=NULL); + // set the offsets for the page + while (slice_count > 0 && sub_idx < MI_PAGE_MAP_SUB_COUNT) { + sub[sub_idx] = page; + slice_count--; + sub_idx++; + } + idx++; // potentially wrap around to the next idx + sub_idx = 0; + } + return true; +} + +static bool mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) { + if mi_unlikely(!mi_page_map_set_range_prim(page,idx,sub_idx,slice_count)) { + // failed to commit, call again to reset the page pointer if needed + if (page!=NULL) { + mi_page_map_set_range_prim(NULL,idx,sub_idx,slice_count); + } + return false; + } + return true; +} + +static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) { + size_t page_size; + uint8_t* page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + ((page_start - mi_page_slice_start(page))/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page_start, sub_idx); +} + +bool _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! 
+ if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return false; + } + mi_assert(_mi_page_map!=NULL); + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + return mi_page_map_set_range(page, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + if mi_unlikely(_mi_page_map == NULL) return; + // get index and count + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + // unset the offsets + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + if mi_unlikely(_mi_page_map == NULL) return; + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + size_t sub_idx; + const uintptr_t idx = _mi_page_map_index(start, &sub_idx); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed? +} + +// Return NULL for invalid pointers +mi_page_t* _mi_safe_ptr_page(const void* p) { + if (p==NULL) return NULL; + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + size_t sub_idx; + const size_t idx = _mi_page_map_index(p,&sub_idx); + if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; + mi_page_t** const sub = _mi_page_map[idx]; + if mi_unlikely(sub==NULL) return NULL; + return sub[sub_idx]; +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + return (_mi_safe_ptr_page(p) != NULL); +} + +#endif diff --git a/system/lib/mimalloc/src/page-queue.c b/system/lib/mimalloc/src/page-queue.c index ceea91ee4dcbd..4c54cb9a20757 100644 --- a/system/lib/mimalloc/src/page-queue.c +++ b/system/lib/mimalloc/src/page-queue.c @@ -38,15 +38,19 @@ terms of the MIT license. 
A copy of the license can be found in the file static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t))); } static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t)))); } static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { - return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX); + return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE); +} + +static inline size_t mi_page_queue_count(const mi_page_queue_t* pq) { + return pq->count; } /* ----------------------------------------------------------- @@ -57,27 +61,23 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. -static inline uint8_t mi_bin(size_t size) { +static mi_decl_noinline size_t mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); - uint8_t bin; - if (wsize <= 1) { - bin = 1; +#if defined(MI_ALIGN4W) + if mi_likely(wsize <= 4) { + return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } - #if defined(MI_ALIGN4W) - else if (wsize <= 4) { - bin = (uint8_t)((wsize+1)&~1); // round to double word sizes +#elif defined(MI_ALIGN2W) + if mi_likely(wsize <= 8) { + return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } - #elif defined(MI_ALIGN2W) - else if (wsize <= 8) { - bin = (uint8_t)((wsize+1)&~1); // round to double word sizes - } - #else - else if (wsize <= 8) { - bin = (uint8_t)wsize; +#else + if mi_likely(wsize <= 8) { + return (wsize == 0 ? 
1 : wsize); } - #endif - else if (wsize > MI_MEDIUM_OBJ_WSIZE_MAX) { - bin = MI_BIN_HUGE; +#endif + else if mi_unlikely(wsize > MI_LARGE_MAX_OBJ_WSIZE) { + return MI_BIN_HUGE; } else { #if defined(MI_ALIGN4W) @@ -85,15 +85,14 @@ static inline uint8_t mi_bin(size_t size) { #endif wsize--; // find the highest bit - uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0 + const size_t b = (MI_SIZE_BITS - 1 - mi_clz(wsize)); // note: wsize != 0 // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin - bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; - mi_assert_internal(bin < MI_BIN_HUGE); + const size_t bin = ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3; + mi_assert_internal(bin > 0 && bin < MI_BIN_HUGE); + return bin; } - mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE); - return bin; } @@ -102,17 +101,18 @@ static inline uint8_t mi_bin(size_t size) { Queue of pages with free blocks ----------------------------------------------------------- */ -uint8_t _mi_bin(size_t size) { +size_t _mi_bin(size_t size) { return mi_bin(size); } -size_t _mi_bin_size(uint8_t bin) { - return _mi_heap_empty.pages[bin].block_size; +size_t _mi_bin_size(size_t bin) { + mi_assert_internal(bin <= MI_BIN_HUGE); + return _mi_theap_empty.pages[bin].block_size; } // Good size for allocation -size_t mi_good_size(size_t size) mi_attr_noexcept { - if (size <= MI_MEDIUM_OBJ_SIZE_MAX) { +mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept { + if (size <= MI_LARGE_MAX_OBJ_SIZE) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { @@ -136,29 +136,64 @@ static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page #endif #if (MI_DEBUG>1) -static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) { - return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]); +static bool 
mi_theap_contains_queue(const mi_theap_t* theap, const mi_page_queue_t* pq) { + return (pq >= &theap->pages[0] && pq <= &theap->pages[MI_BIN_FULL]); } #endif -static inline bool mi_page_is_large_or_huge(const mi_page_t* page) { - return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page)); +bool _mi_page_queue_is_valid(mi_theap_t* theap, const mi_page_queue_t* pq) { + MI_UNUSED_RELEASE(theap); + if (pq==NULL) return false; + size_t count = 0; MI_UNUSED_RELEASE(count); + mi_page_t* prev = NULL; MI_UNUSED_RELEASE(prev); + for (mi_page_t* page = pq->first; page != NULL; page = page->next) { + mi_assert_internal(page->prev == prev); + if (mi_page_is_in_full(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 2); + } + else if (mi_page_is_huge(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 1); + } + else { + mi_assert_internal(mi_page_block_size(page) == pq->block_size); + } + mi_assert_internal(page->theap == theap); + if (page->next == NULL) { + mi_assert_internal(pq->last == page); + } + count++; + prev = page; + } + mi_assert_internal(pq->count == count); + return true; } -static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - mi_assert_internal(heap!=NULL); - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); +static size_t mi_page_bin(const mi_page_t* page) { + const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); mi_assert_internal(bin <= MI_BIN_FULL); - mi_page_queue_t* pq = &heap->pages[bin]; + return bin; +} + +// returns the page bin without using MI_BIN_FULL for statistics +size_t _mi_page_stats_bin(const mi_page_t* page) { + const size_t bin = (mi_page_is_huge(page) ? 
MI_BIN_HUGE : mi_bin(mi_page_block_size(page))); + mi_assert_internal(bin <= MI_BIN_HUGE); + return bin; +} + +static mi_page_queue_t* mi_theap_page_queue_of(mi_theap_t* theap, const mi_page_t* page) { + mi_assert_internal(theap!=NULL); + const size_t bin = mi_page_bin(page); + mi_page_queue_t* pq = &theap->pages[bin]; mi_assert_internal((mi_page_block_size(page) == pq->block_size) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - mi_heap_t* heap = mi_page_heap(page); - mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_theap_t* theap = mi_page_theap(page); + mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page); mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -168,8 +203,8 @@ static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { // size without having to compute the bin. This means when the // current free page queue is updated for a small bin, we need to update a // range of entries in `_mi_page_small_free`. 
-static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) { - mi_assert_internal(mi_heap_contains_queue(heap,pq)); +static inline void mi_theap_queue_first_update(mi_theap_t* theap, const mi_page_queue_t* pq) { + mi_assert_internal(mi_theap_contains_queue(theap,pq)); size_t size = pq->block_size; if (size > MI_SMALL_SIZE_MAX) return; @@ -179,7 +214,7 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que // find index in the right direct page array size_t start; size_t idx = _mi_wsize_from_size(size); - mi_page_t** pages_free = heap->pages_free_direct; + mi_page_t** pages_free = theap->pages_free_direct; if (pages_free[idx] == page) return; // already set @@ -189,9 +224,9 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que } else { // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped - uint8_t bin = mi_bin(size); + size_t bin = mi_bin(size); const mi_page_queue_t* prev = pq - 1; - while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) { + while( bin == mi_bin(prev->block_size) && prev > &theap->pages[0]) { prev--; } start = 1 + _mi_wsize_from_size(prev->block_size); @@ -214,40 +249,40 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_internal(queue->count >= 1); mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); - mi_heap_t* heap = mi_page_heap(page); - + mi_theap_t* theap = mi_page_theap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == queue->last) 
queue->last = page->prev; if (page == queue->first) { queue->first = page->next; // update first - mi_assert_internal(mi_heap_contains_queue(heap, queue)); - mi_heap_queue_first_update(heap,queue); + mi_assert_internal(mi_theap_contains_queue(theap, queue)); + mi_theap_queue_first_update(theap,queue); } - heap->page_count--; + theap->page_count--; + queue->count--; page->next = NULL; page->prev = NULL; - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); mi_page_set_in_full(page,false); } -static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { - mi_assert_internal(mi_page_heap(page) == heap); +static void mi_page_queue_push(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_theap(page) == theap); mi_assert_internal(!mi_page_queue_contains(queue, page)); #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); + page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { @@ -258,15 +293,54 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ else { queue->first = queue->last = page; } + queue->count++; // update direct - mi_heap_queue_first_update(heap, queue); - heap->page_count++; + mi_theap_queue_first_update(theap, queue); + theap->page_count++; } +static void mi_page_queue_push_at_end(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_theap(page) == theap); + 
mi_assert_internal(!mi_page_queue_contains(queue, page)); -static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + + mi_page_set_in_full(page, mi_page_queue_is_full(queue)); + + page->prev = queue->last; + page->next = NULL; + if (queue->last != NULL) { + mi_assert_internal(queue->last->next == NULL); + queue->last->next = page; + queue->last = page; + } + else { + queue->first = queue->last = page; + } + queue->count++; + + // update direct + if (queue->first == page) { + mi_theap_queue_first_update(theap, queue); + } + theap->page_count++; +} + +static void mi_page_queue_move_to_front(mi_theap_t* theap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_theap(page) == theap); + mi_assert_internal(mi_page_queue_contains(queue, page)); + if (queue->first == page) return; + mi_page_queue_remove(queue, page); + mi_page_queue_push(theap, queue, page); + mi_assert_internal(queue->first == page); +} + +static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) { mi_assert_internal(page != NULL); + mi_assert_internal(from->count >= 1); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); const size_t bsize = mi_page_block_size(page); @@ -274,62 +348,98 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || (bsize == to->block_size && mi_page_queue_is_full(from)) || (bsize == from->block_size && mi_page_queue_is_full(to)) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(to)) || - (mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to))); + (mi_page_is_huge(page) && 
mi_page_queue_is_huge(to)) || + (mi_page_is_huge(page) && mi_page_queue_is_full(to))); - mi_heap_t* heap = mi_page_heap(page); + mi_theap_t* theap = mi_page_theap(page); + + // delete from `from` if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == from->last) from->last = page->prev; if (page == from->first) { from->first = page->next; // update first - mi_assert_internal(mi_heap_contains_queue(heap, from)); - mi_heap_queue_first_update(heap, from); + mi_assert_internal(mi_theap_contains_queue(theap, from)); + mi_theap_queue_first_update(theap, from); } - - page->prev = to->last; - page->next = NULL; - if (to->last != NULL) { - mi_assert_internal(heap == mi_page_heap(to->last)); - to->last->next = page; - to->last = page; + from->count--; + + // insert into `to` + to->count++; + if (enqueue_at_end) { + // enqueue at the end + page->prev = to->last; + page->next = NULL; + if (to->last != NULL) { + mi_assert_internal(theap == mi_page_theap(to->last)); + to->last->next = page; + to->last = page; + } + else { + to->first = page; + to->last = page; + mi_theap_queue_first_update(theap, to); + } } else { - to->first = page; - to->last = page; - mi_heap_queue_first_update(heap, to); + if (to->first != NULL) { + // enqueue at 2nd place + mi_assert_internal(theap == mi_page_theap(to->first)); + mi_page_t* next = to->first->next; + page->prev = to->first; + page->next = next; + to->first->next = page; + if (next != NULL) { + next->prev = page; + } + else { + to->last = page; + } + } + else { + // enqueue at the head (singleton list) + page->prev = NULL; + page->next = NULL; + to->first = page; + to->last = page; + mi_theap_queue_first_update(theap, to); + } } mi_page_set_in_full(page, mi_page_queue_is_full(to)); } -// Only called from `mi_heap_absorb`. 
-size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { - mi_assert_internal(mi_heap_contains_queue(heap,pq)); +static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end */, page); +} + +static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); +} + +// Only called from `mi_theap_absorb`. +size_t _mi_page_queue_append(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_queue_t* append) { + mi_assert_internal(mi_theap_contains_queue(theap,pq)); mi_assert_internal(pq->block_size == append->block_size); if (append->first==NULL) return 0; - // set append pages to new heap and count + // set append pages to new theap and count size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { - // inline `mi_page_set_heap` to avoid wrong assertion during absorption; - // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); - // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a - // side effect that it spins until any DELAYED_FREEING is finished. This ensures - // that after appending only the new heap will be used for delayed free operations. 
- _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false); + mi_page_set_theap(page, theap); count++; } + mi_assert_internal(count == append->count); if (pq->last==NULL) { // take over afresh mi_assert_internal(pq->first==NULL); pq->first = append->first; pq->last = append->last; - mi_heap_queue_first_update(heap, pq); + mi_theap_queue_first_update(theap, pq); } else { // append to end @@ -339,5 +449,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue append->first->prev = pq->last; pq->last = append->last; } + pq->count += append->count; + return count; } diff --git a/system/lib/mimalloc/src/page.c b/system/lib/mimalloc/src/page.c index 871ed21514775..a2fa937065176 100644 --- a/system/lib/mimalloc/src/page.c +++ b/system/lib/mimalloc/src/page.c @@ -14,6 +14,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" +#include "mimalloc/prim.h" /* ----------------------------------------------------------- Definition of page queues for each block size @@ -36,14 +37,16 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta return (mi_block_t*)((uint8_t*)page_start + (i * block_size)); } -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); -static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); +static bool mi_page_extend_free(mi_theap_t* theap, mi_page_t* page); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { + mi_assert_internal(_mi_ptr_page(page->page_start) == page); + const uint8_t* slice_start = mi_page_slice_start(page); + mi_assert_internal(_mi_is_aligned(slice_start,MI_PAGE_ALIGN)); size_t count = 0; while (head != NULL) { - mi_assert_internal(page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - slice_start > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; 
head = mi_block_next(page, head); } @@ -59,7 +62,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = mi_page_area(page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -82,9 +85,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - uint8_t* start = mi_page_start(page); - mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL)); - mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE)); + // const size_t bsize = mi_page_block_size(page); + // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -112,94 +114,45 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? 
bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); #if MI_SECURE mi_assert_internal(page->keys[0] != 0); #endif - if (mi_page_heap(page)!=NULL) { - mi_segment_t* segment = _mi_page_segment(page); - - mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id); - #if MI_HUGE_PAGE_ABANDON - if (segment->kind != MI_SEGMENT_HUGE) - #endif + if (!mi_page_is_abandoned(page)) { + //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page)); - mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); + // mi_assert_internal(mi_theap_contains_queue(mi_page_theap(page),pq)); } } return true; } #endif -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - while (!_mi_page_try_use_delayed_free(page, delay, override_never)) { - mi_atomic_yield(); - } -} - -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - mi_thread_free_t tfreex; - mi_delayed_t old_delay; - mi_thread_free_t tfree; - size_t yield_count = 0; - do { - tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; - tfreex = mi_tf_set_delayed(tfree, delay); - old_delay = mi_tf_delayed(tfree); - if mi_unlikely(old_delay == MI_DELAYED_FREEING) { - if (yield_count >= 4) return false; // give up after 4 tries - yield_count++; - mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done. 
- // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail - } - else if (delay == old_delay) { - break; // avoid atomic operation if already equal - } - else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) { - break; // leave never-delayed flag set - } - } while ((old_delay == MI_DELAYED_FREEING) || - !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - return true; // success -} /* ----------------------------------------------------------- Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -// Collect the local `thread_free` list using an atomic exchange. -// Note: The exchange must be done atomically as this is used right after -// moving to the full list in `mi_page_collect_ex` and we need to -// ensure that there was no race where the page became unfull just before the move. -static void _mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { - mi_block_t* head; - mi_thread_free_t tfreex; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - head = mi_tf_block(tfree); - tfreex = mi_tf_set_block(tfree,NULL); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); - - // return if the list is empty if (head == NULL) return; - // find the tail -- also to get a proper count (without data races) + // find the last block in the list -- also to get a proper use count (without data races) size_t max_count = page->capacity; // cannot collect more than capacity size_t count = 1; - mi_block_t* tail = head; + mi_block_t* last = head; mi_block_t* next; - while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { + while ((next = mi_block_next(page, last)) != NULL && count <= max_count) { count++; - tail = next; + last = next; } + // if `count > max_count` there was a memory corruption (possibly infinite list due to 
double multi-threaded free) if (count > max_count) { _mi_error_message(EFAULT, "corrupted thread-free list\n"); @@ -207,20 +160,49 @@ static void _mi_page_thread_free_collect(mi_page_t* page) } // and append the current local free list - mi_block_set_next(page,tail, page->local_free); + mi_block_set_next(page, last, page->local_free); page->local_free = head; // update counts now - page->used -= (uint16_t)count; + mi_assert_internal(count <= UINT16_MAX); + mi_assert_internal(page->used >= (uint16_t)count); + page->used = page->used - (uint16_t)count; +} + +// Collect the local `thread_free` list using an atomic exchange. +static void mi_page_thread_free_collect(mi_page_t* page) +{ + // atomically capture the thread free list + mi_block_t* head; + mi_thread_free_t tfreex; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + head = mi_tf_block(tfree); + if mi_likely(head == NULL) return; // return if the list is empty + tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? + mi_assert_internal(head != NULL); + + // and move it to the local list + mi_page_thread_collect_to_local(page, head); +} + +// returns `true` if after collection `mi_page_immediate_available` is true. 
+static bool mi_page_free_quick_collect(mi_page_t* page) { + if (page->free != NULL) return true; + if (page->local_free == NULL) return false; + // move local_free to free + page->free = page->local_free; + page->local_free = NULL; + page->free_is_zero = false; + return true; } void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(page!=NULL); // collect the thread free list - if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation - _mi_page_thread_free_collect(page); - } + mi_page_thread_free_collect(page); // and the local free list if (page->local_free != NULL) { @@ -247,118 +229,149 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } +// Collect elements in the thread-free list starting at `head`. This is an optimized +// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`. +// +// `head` must be in the `xthread_free` list. It will not collect `head` itself +// so the `used` count is not fully updated in general. However, if the `head` is +// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true). 
+void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { + if (head == NULL) return; + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops) + if (next != NULL) { + mi_block_set_next(page, head, NULL); + mi_page_thread_collect_to_local(page, next); + if (page->local_free != NULL && page->free == NULL) { + page->free = page->local_free; + page->local_free = NULL; + page->free_is_zero = false; + } + } + if (page->used == 1) { + // all elements are free'd since we skipped the `head` element itself + mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head); + mi_assert_internal(mi_block_next(page,head) == NULL); + _mi_page_free_collect(page, false); // collect the final element + } +} /* ----------------------------------------------------------- Page fresh and retire ----------------------------------------------------------- */ +/* // called from segments when reclaiming abandoned pages -void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { - mi_assert_expensive(mi_page_is_valid_init(page)); +void _mi_page_reclaim(mi_theap_t* theap, mi_page_t* page) { + // mi_page_set_theap(page, theap); + // _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after theap is set) + _mi_page_free_collect(page, false); // ensure used count is up to date - mi_assert_internal(mi_page_heap(page) == heap); - mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); - #endif + mi_assert_expensive(mi_page_is_valid_init(page)); + // mi_assert_internal(mi_page_theap(page) == theap); + // mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); // TODO: push on full queue immediately if it is full? 
- mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); - mi_page_queue_push(heap, pq, page); + mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page); + mi_page_queue_push(theap, pq, page); + mi_assert_expensive(_mi_page_is_valid(page)); +} +*/ + +// called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page +void _mi_theap_page_reclaim(mi_theap_t* theap, mi_page_t* page) +{ + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + + mi_page_set_theap(page,theap); + _mi_page_free_collect(page, false); // ensure used count is up to date + mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page); + mi_page_queue_push_at_end(theap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } -// allocate a fresh page from a segment -static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { + _mi_page_free_collect(page, false); // ensure used count is up to date + if (mi_page_all_free(page)) { + _mi_page_free(page, pq); + } + else { + mi_page_queue_remove(pq, page); + mi_theap_t* theap = page->theap; + mi_page_set_theap(page, NULL); + page->theap = theap; // don't actually set theap to NULL so we can reclaim_on_free within the same theap + _mi_arenas_page_abandon(page, theap); + _mi_arenas_collect(false, false, theap->tld); // allow purging + } +} + + +// allocate a fresh page from an arena +static mi_page_t* mi_page_fresh_alloc(mi_theap_t* theap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { #if !MI_HUGE_PAGE_ABANDON mi_assert_internal(pq != NULL); - mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == 
pq->block_size); + mi_assert_internal(mi_theap_contains_queue(theap, pq)); + mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); + mi_page_t* page = _mi_arenas_page_alloc(theap, block_size, page_alignment); if (page == NULL) { - // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) + // out-of-memory return NULL; } - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif - mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE); + if (mi_page_is_abandoned(page)) { + _mi_theap_page_reclaim(theap, page); + if (!mi_page_immediate_available(page)) { + if (mi_page_is_expandable(page)) { + if (!mi_page_extend_free(theap, page)) { + return NULL; // cannot commit + }; + } + else { + mi_assert(false); // should not happen? + return NULL; + } + } + } + else if (pq != NULL) { + mi_page_queue_push(theap, pq, page); + } mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); - // a fresh page was found, initialize it - const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? 
mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc - mi_assert_internal(full_block_size >= block_size); - mi_page_init(heap, page, full_block_size, heap->tld); - mi_heap_stat_increase(heap, pages, 1); - if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); return page; } // Get a fresh page to use -static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { - mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); +static mi_page_t* mi_page_fresh(mi_theap_t* theap, mi_page_queue_t* pq) { + mi_assert_internal(mi_theap_contains_queue(theap, pq)); + mi_page_t* page = mi_page_fresh_alloc(theap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); + mi_assert_internal(pq==mi_theap_page_queue_of(theap, page)); return page; } -/* ----------------------------------------------------------- - Do any delayed frees - (put there by other threads if they deallocated in a full page) ------------------------------------------------------------ */ -void _mi_heap_delayed_free_all(mi_heap_t* heap) { - while (!_mi_heap_delayed_free_partial(heap)) { - mi_atomic_yield(); - } -} - -// returns true if all delayed frees were processed -bool _mi_heap_delayed_free_partial(mi_heap_t* heap) { - // take over the list (note: no atomic exchange since it is often NULL) - mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ }; - bool all_freed = true; - - // and free them all - while(block != NULL) { - mi_block_t* next = mi_block_nextx(heap,block, heap->keys); - // use internal free instead of regular one to keep stats etc correct - if 
(!_mi_free_delayed_block(block)) { - // we might already start delayed freeing while another thread has not yet - // reset the delayed_freeing flag; in that case delay it further by reinserting the current block - // into the delayed free list - all_freed = false; - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap, block, dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - block = next; - } - return all_freed; -} /* ----------------------------------------------------------- Unfull, abandon, free and retire ----------------------------------------------------------- */ -// Move a page from the full list back to a regular list +// Move a page from the full list back to a regular list (called from thread-local mi_free) void _mi_page_unfull(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_is_in_full(page)); + mi_assert_internal(!mi_page_theap(page)->allow_page_abandon); if (!mi_page_is_in_full(page)) return; - mi_heap_t* heap = mi_page_heap(page); - mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL]; + mi_theap_t* theap = mi_page_theap(page); + mi_page_queue_t* pqfull = &theap->pages[MI_BIN_FULL]; mi_page_set_in_full(page, false); // to get the right queue - mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_page_queue_t* pq = mi_theap_page_queue_of(theap, page); mi_page_set_in_full(page, true); - mi_page_queue_enqueue_from(pq, pqfull, page); + mi_page_queue_enqueue_from_full(pq, pqfull, page); } static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { @@ -366,74 +379,48 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(!mi_page_immediate_available(page)); mi_assert_internal(!mi_page_is_in_full(page)); - if (mi_page_is_in_full(page)) return; - 
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); - _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set -} - - -// Abandon a page with used blocks at the end of a thread. -// Note: only call if it is ensured that no references exist from -// the `page->heap->thread_delayed_free` into this page. -// Currently only called through `mi_heap_collect_ex` which ensures this. -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { - mi_assert_internal(page != NULL); - mi_assert_expensive(_mi_page_is_valid(page)); - mi_assert_internal(pq == mi_page_queue_of(page)); - mi_assert_internal(mi_page_heap(page) != NULL); - - mi_heap_t* pheap = mi_page_heap(page); - - // remove from our page list - mi_segments_tld_t* segments_tld = &pheap->tld->segments; - mi_page_queue_remove(pq, page); - - // page is no longer associated with our heap - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_page_set_heap(page, NULL); - -#if (MI_DEBUG>1) && !MI_TRACK_ENABLED - // check there are no references left.. 
- for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { - mi_assert_internal(_mi_ptr_page(block) != page); + mi_theap_t* theap = mi_page_theap(page); + if (theap->allow_page_abandon) { + // abandon full pages (this is the usual case in order to allow for sharing of memory between theaps) + _mi_page_abandon(page, pq); + } + else if (!mi_page_is_in_full(page)) { + // put full pages in a theap local queue (this is for theaps that cannot abandon, for example, if the theap can be destroyed) + mi_page_queue_enqueue_from(&mi_page_theap(page)->pages[MI_BIN_FULL], pq, page); + _mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } -#endif - - // and abandon it - mi_assert_internal(mi_page_heap(page) == NULL); - _mi_segment_page_abandon(page,segments_tld); } // Free a page with no more free blocks -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); + // mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); // no more aligned blocks in here - mi_page_set_has_aligned(page, false); - - mi_heap_t* heap = mi_page_heap(page); + mi_page_set_has_interior_pointers(page, false); // remove from the page list - // (no need to do _mi_heap_delayed_free first as all blocks are already free) - mi_segments_tld_t* segments_tld = &heap->tld->segments; + // (no need to do _mi_theap_delayed_free first as all blocks are already free) mi_page_queue_remove(pq, page); // and free it - mi_page_set_heap(page,NULL); - _mi_segment_page_free(page, force, segments_tld); + mi_theap_t* theap = 
mi_page_theap(page); mi_assert_internal(theap!=NULL); + mi_page_set_theap(page,NULL); + _mi_arenas_page_free(page, theap); + _mi_arenas_collect(false, false, theap->tld); // allow purging } -#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE +#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks // Important to not retire too quickly though as new // allocations might coming. +// // Note: called from `mi_free` and benchmarks often // trigger this due to freeing everything and then // allocating again so careful when changing this. @@ -442,7 +429,8 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_all_free(page)); - mi_page_set_has_aligned(page, false); + if (page->retire_expire!=0) return; // already retired, just keep it retired + mi_page_set_has_interior_pointers(page, false); // don't retire too often.. // (or we end up retiring and re-allocating most of the time) @@ -451,37 +439,41 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); + #if MI_RETIRE_CYCLES > 0 const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? - mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); - mi_heap_t* heap = mi_page_heap(page); - mi_assert_internal(pq >= heap->pages); - const size_t index = pq - heap->pages; + mi_theap_t* theap = mi_page_theap(page); + #if MI_STAT>0 + mi_theap_stat_counter_increase(theap, pages_retire, 1); + #endif + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + mi_assert_internal(pq >= theap->pages); + const size_t index = pq - theap->pages; mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); - if (index < heap->page_retired_min) heap->page_retired_min = index; - if (index > heap->page_retired_max) heap->page_retired_max = index; + if (index < theap->page_retired_min) theap->page_retired_min = index; + if (index > theap->page_retired_max) theap->page_retired_max = index; mi_assert_internal(mi_page_all_free(page)); return; // don't free after all } } - _mi_page_free(page, pq, false); + #endif + _mi_page_free(page, pq); } // free retired pages: we don't need to look at the entire queues // since we only retire pages that are at the head position in a queue. 
-void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { +void _mi_theap_collect_retired(mi_theap_t* theap, bool force) { size_t min = MI_BIN_FULL; size_t max = 0; - for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) { - mi_page_queue_t* pq = &heap->pages[bin]; + for(size_t bin = theap->page_retired_min; bin <= theap->page_retired_max; bin++) { + mi_page_queue_t* pq = &theap->pages[bin]; mi_page_t* page = pq->first; if (page != NULL && page->retire_expire != 0) { if (mi_page_all_free(page)) { page->retire_expire--; - if (force || page->retire_expire == 0) { - _mi_page_free(pq->first, pq, force); + if (page->retire_expire == 0 || force) { + _mi_page_free(page, pq); } else { // keep retired, update min/max @@ -494,9 +486,32 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { } } } - heap->page_retired_min = min; - heap->page_retired_max = max; + theap->page_retired_min = min; + theap->page_retired_max = max; +} + +/* +static void mi_theap_collect_full_pages(mi_theap_t* theap) { + // note: normally full pages get immediately abandoned and the full queue is always empty + // this path is only used if abandoning is disabled due to a destroy-able theap or options + // set by the user. + mi_page_queue_t* pq = &theap->pages[MI_BIN_FULL]; + for (mi_page_t* page = pq->first; page != NULL; ) { + mi_page_t* next = page->next; // get next in case we free the page + _mi_page_free_collect(page, false); // register concurrent free's + // no longer full? 
+ if (!mi_page_is_full(page)) { + if (mi_page_all_free(page)) { + _mi_page_free(page, pq); + } + else { + _mi_page_unfull(page); + } + } + page = next; + } } +*/ /* ----------------------------------------------------------- @@ -509,9 +524,8 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { #define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT) #define MI_MIN_SLICES (2) -static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { - MI_UNUSED(stats); - #if (MI_SECURE<=2) +static void mi_page_free_list_extend_secure(mi_theap_t* const theap, mi_page_t* const page, const size_t bsize, const size_t extend) { + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -538,7 +552,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co // and initialize the free list by randomly threading through them // set up first element - const uintptr_t r = _mi_heap_random_next(heap); + const uintptr_t r = _mi_theap_random_next(theap); size_t current = r % slice_count; counts[current]--; mi_block_t* const free_start = blocks[current]; @@ -566,10 +580,9 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co page->free = free_start; } -static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) +static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend) { - MI_UNUSED(stats); - #if (MI_SECURE <= 2) + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -597,10 +610,10 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co ----------------------------------------------------------- */ #define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one 
OS page seems to work well. -#if (MI_SECURE>0) +#if (MI_SECURE>=3) #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #else -#define MI_MIN_EXTEND (4) +#define MI_MIN_EXTEND (1) #endif // Extend the capacity (up to reserved) by initializing a free list @@ -608,21 +621,25 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // Note: we also experimented with "bump" allocation on the first // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) -static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { - MI_UNUSED(tld); +static bool mi_page_extend_free(mi_theap_t* theap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); - #if (MI_SECURE<=2) + #if (MI_SECURE<3) mi_assert(page->free == NULL); mi_assert(page->local_free == NULL); - if (page->free != NULL) return; + if (page->free != NULL) return true; #endif - if (page->capacity >= page->reserved) return; + if (page->capacity >= page->reserved) return true; - mi_stat_counter_increase(tld->stats.pages_extended, 1); + size_t page_size; + //uint8_t* page_start = + mi_page_area(page, &page_size); + #if MI_STAT>0 + mi_theap_stat_counter_increase(theap, pages_extended, 1); + #endif // calculate the extend count const size_t bsize = mi_page_block_size(page); - size_t extend = page->reserved - page->capacity; + size_t extend = (size_t)page->reserved - page->capacity; mi_assert_internal(extend > 0); size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); @@ -638,73 +655,79 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); mi_assert_internal(extend < (1UL<<16)); + // commit on demand? 
+ if (page->slice_committed > 0) { + const size_t needed_size = (page->capacity + extend)*bsize; + const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE ); + if (needed_commit > page->slice_committed) { + mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0); + if (!_mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL)) { + return false; + } + page->slice_committed = needed_commit; + } + } + // and append the extend the free list - if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { - mi_page_free_list_extend(page, bsize, extend, &tld->stats ); + if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) { + mi_page_free_list_extend(page, bsize, extend ); } else { - mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats); + mi_page_free_list_extend_secure(theap, page, bsize, extend); } // enable the new free list page->capacity += (uint16_t)extend; - mi_stat_increase(tld->stats.page_committed, extend * bsize); + #if MI_STAT>0 + mi_theap_stat_increase(theap, page_committed, extend * bsize); + #endif mi_assert_expensive(mi_page_is_valid_init(page)); + return true; } -// Initialize a fresh page -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { +// Initialize a fresh page (that is already partially initialized) +mi_decl_nodiscard bool _mi_page_init(mi_theap_t* theap, mi_page_t* page) { mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert(segment != NULL); - mi_assert_internal(block_size > 0); - // set fields - mi_page_set_heap(page, heap); - page->block_size = block_size; + mi_assert(theap!=NULL); + page->heap = (_mi_is_heap_main(_mi_theap_heap(theap)) ? 
NULL : _mi_theap_heap(theap)); // faster for `mi_page_associated_theap` + mi_page_set_theap(page, theap); + size_t page_size; - page->page_start = _mi_segment_page_start(segment, page, &page_size); - mi_track_mem_noaccess(page->page_start,page_size); - mi_assert_internal(mi_page_block_size(page) <= page_size); - mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE); - mi_assert_internal(page_size / block_size < (1L<<16)); - page->reserved = (uint16_t)(page_size / block_size); + uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); + mi_track_mem_noaccess(page_start,page_size); + mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); #if (MI_PADDING || MI_ENCODE_FREELIST) - page->keys[0] = _mi_heap_random_next(heap); - page->keys[1] = _mi_heap_random_next(heap); + page->keys[0] = _mi_theap_random_next(theap); + page->keys[1] = _mi_theap_random_next(theap); #endif - page->free_is_zero = page->is_zero_init; #if MI_DEBUG>2 - if (page->is_zero_init) { - mi_track_mem_defined(page->page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); + if (page->memid.initially_zero) { + mi_track_mem_defined(page->page_start, mi_page_committed(page)); + mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page))); } #endif - mi_assert_internal(page->is_committed); - if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); - } - else { - page->block_size_shift = 0; - } + mi_assert_internal(page->theap!=NULL); + mi_assert_internal(page->theap == mi_page_theap(page)); mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); - mi_assert_internal(page->xthread_free == 0); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(page->xthread_free == 1); mi_assert_internal(page->next == NULL); 
mi_assert_internal(page->prev == NULL); mi_assert_internal(page->retire_expire == 0); - mi_assert_internal(!mi_page_has_aligned(page)); + mi_assert_internal(!mi_page_has_interior_pointers(page)); #if (MI_PADDING || MI_ENCODE_FREELIST) mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list - mi_page_extend_free(heap,page,tld); + if (!mi_page_extend_free(theap,page)) return false; mi_assert(mi_page_immediate_available(page)); + return true; } @@ -713,85 +736,140 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi -------------------------------------------------------------*/ // Find a page with free blocks of `page->block_size`. -static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) +static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_theap_t* theap, mi_page_queue_t* pq, bool first_try) { // search through the pages in "next fit" order - #if MI_STAT size_t count = 0; - #endif + long candidate_limit = 0; // we reset this on the first candidate to limit the search + long page_full_retain = (pq->block_size > MI_SMALL_MAX_OBJ_SIZE ? 0 : theap->page_full_retain); // only retain small pages + mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; + while (page != NULL) { - mi_page_t* next = page->next; // remember next - #if MI_STAT + mi_page_t* next = page->next; // remember next (as this page can move to another queue) count++; - #endif + candidate_limit--; - // 0. collect freed blocks by us and other threads - _mi_page_free_collect(page, false); + // search up to N pages for a best candidate - // 1. 
if the page contains free blocks, we are done - if (mi_page_immediate_available(page)) { - break; // pick this one + // is the local free list non-empty? + bool immediate_available = mi_page_immediate_available(page); + if (!immediate_available) { + // collect freed blocks by us and other threads to we get a proper use count + _mi_page_free_collect(page, false); + immediate_available = mi_page_immediate_available(page); } - // 2. Try to extend - if (page->capacity < page->reserved) { - mi_page_extend_free(heap, page, heap->tld); - mi_assert_internal(mi_page_immediate_available(page)); - break; + // if the page is completely full, move it to the `mi_pages_full` + // queue so we don't visit long-lived pages too often. + if (!immediate_available && !mi_page_is_expandable(page)) { + page_full_retain--; + if (page_full_retain < 0) { + mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); + mi_page_to_full(page, pq); + } + } + else { + // the page has free space, make it a candidate + // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) + if (page_candidate == NULL) { + page_candidate = page; + candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); + } + else if (mi_page_all_free(page_candidate)) { + _mi_page_free(page_candidate, pq); + page_candidate = page; + } + // prefer to reuse fuller pages (in the hope the less used page gets freed) + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) { + page_candidate = page; + } + // if we find a non-expandable candidate, or searched for N pages, return with the best candidate + if (immediate_available || candidate_limit <= 0) { + mi_assert_internal(page_candidate!=NULL); + break; + } + } + + #if 0 + // first-fit algorithm without candidates + // If the page contains free blocks, we are done + if (mi_page_immediate_available(page) || 
mi_page_is_expandable(page)) { + break; // pick this one } - // 3. If the page is completely full, move it to the `mi_pages_full` + // If the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); + #endif page = next; } // for each page - mi_heap_stat_counter_increase(heap, searches, count); + mi_theap_stat_counter_increase(theap, page_searches, count); + mi_theap_stat_counter_increase(theap, page_searches_count, 1); + + // set the page to the best candidate + if (page_candidate != NULL) { + page = page_candidate; + } + if (page != NULL) { + if (!mi_page_immediate_available(page)) { + mi_assert_internal(mi_page_is_expandable(page)); + if (!mi_page_extend_free(theap, page)) { + page = NULL; // failed to extend + } + } + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + } if (page == NULL) { - _mi_heap_collect_retired(heap, false); // perhaps make a page available? 
- page = mi_page_fresh(heap, pq); + _mi_theap_collect_retired(theap, false); // perhaps make a page available + page = mi_page_fresh(theap, pq); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again - page = mi_page_queue_find_free_ex(heap, pq, false); + page = mi_page_queue_find_free_ex(theap, pq, false); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); } } else { - mi_assert(pq->first == page); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + // move the page to the front of the queue + mi_page_queue_move_to_front(theap, pq, page); page->retire_expire = 0; + // _mi_theap_collect_retired(theap, false); // update retire counts; note: increases rss on MemoryLoad bench so don't do this } mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + + return page; } // Find a page with free blocks of `size`. 
-static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { - mi_page_queue_t* pq = mi_page_queue(heap,size); +static mi_page_t* mi_find_free_page(mi_theap_t* theap, mi_page_queue_t* pq) { + // mi_page_queue_t* pq = mi_page_queue(theap, size); + mi_assert_internal(!mi_page_queue_is_huge(pq)); + + // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; - if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness - if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { - mi_page_extend_free(heap, page, heap->tld); + if mi_likely(page != NULL && mi_page_free_quick_collect(page)) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if (page->capacity < page->reserved && ((_mi_theap_random_next(theap) & 1) == 1)) { + (void)mi_page_extend_free(theap, page); // ok if this fails mi_assert_internal(mi_page_immediate_available(page)); } - else - #endif - { - _mi_page_free_collect(page,false); - } - - if (mi_page_immediate_available(page)) { - page->retire_expire = 0; - return page; // fast path - } + #endif + page->retire_expire = 0; + return page; // fast path + } + else { + return mi_page_queue_find_free_ex(theap, pq, true); } - return mi_page_queue_find_free_ex(heap, pq, true); } @@ -805,12 +883,12 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { static mi_deferred_free_fun* volatile deferred_free = NULL; static _Atomic(void*) deferred_arg; // = NULL -void _mi_deferred_free(mi_heap_t* heap, bool force) { - heap->tld->heartbeat++; - if (deferred_free != NULL && !heap->tld->recurse) { - heap->tld->recurse = true; - deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg)); - heap->tld->recurse = false; +void _mi_deferred_free(mi_theap_t* theap, bool force) { + theap->heartbeat++; + if (deferred_free != NULL && 
!theap->tld->recurse) { + theap->tld->recurse = true; + deferred_free(force, theap->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg)); + theap->tld->recurse = false; } } @@ -824,46 +902,30 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// Large and huge page allocation. -// Huge pages contain just one block, and the segment contains just that page (as `MI_SEGMENT_HUGE`). +// Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. -static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { - size_t block_size = _mi_os_good_alloc_size(size); - mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); - bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0); +static mi_page_t* mi_huge_page_alloc(mi_theap_t* theap, size_t size, size_t page_alignment, mi_page_queue_t* pq) { + const size_t block_size = _mi_os_good_alloc_size(size); + // mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON - mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size)); + #error todo. #else - mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? 
MI_LARGE_OBJ_SIZE_MAX+1 : block_size); - mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq)); + // mi_page_queue_t* pq = mi_page_queue(theap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size + mi_assert_internal(mi_page_queue_is_huge(pq)); #endif - mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); + mi_page_t* page = mi_page_fresh_alloc(theap, pq, block_size, page_alignment); if (page != NULL) { + mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); - - if (is_huge) { - mi_assert_internal(mi_page_is_huge(page)); - mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); - mi_assert_internal(_mi_page_segment(page)->used==1); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue - mi_page_set_heap(page, NULL); - #endif - } - else { - mi_assert_internal(!mi_page_is_huge(page)); - } - - const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_increase(heap, large, bsize); - mi_heap_stat_counter_increase(heap, large_count, 1); - } - else { - mi_heap_stat_increase(heap, huge, bsize); - mi_heap_stat_counter_increase(heap, huge_count, 1); - } + mi_assert_internal(mi_page_is_huge(page)); + mi_assert_internal(mi_page_is_singleton(page)); + #if MI_HUGE_PAGE_ABANDON + mi_assert_internal(mi_page_is_abandoned(page)); + mi_page_set_theap(page, NULL); + #endif + mi_theap_stat_increase(theap, malloc_huge, mi_page_block_size(page)); + mi_theap_stat_counter_increase(theap, malloc_huge_count, 1); } return page; } @@ -871,53 +933,78 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. 
-static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { - // huge allocation? +static mi_page_t* mi_find_page(mi_theap_t* theap, size_t size, size_t huge_alignment) mi_attr_noexcept { const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { - if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { - _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); - return NULL; - } - else { - return mi_large_huge_page_alloc(heap,size,huge_alignment); - } + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { + _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); + return NULL; + } + mi_page_queue_t* pq = mi_page_queue(theap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size)); + // huge allocation? + if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) { + return mi_huge_page_alloc(theap,size,huge_alignment,pq); } else { // otherwise find a page with free blocks in our size segregated queues #if MI_PADDING mi_assert_internal(size >= MI_PADDING_SIZE); #endif - return mi_find_free_page(heap, size); + return mi_find_free_page(theap, pq); } } + // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for -// very large requested alignments in which case we use a huge segment. -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept +// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for +// very large requested alignments in which case we use a huge singleton page. 
+// Note: we put `bool zero, size_t huge_alignment` into one parameter (with zero in the low bit) +// to use 4 parameters which compiles better on msvc for the malloc fast path. +void* _mi_malloc_generic(mi_theap_t* theap, size_t size, size_t zero_huge_alignment, size_t* usable) mi_attr_noexcept { - mi_assert_internal(heap != NULL); + const bool zero = ((zero_huge_alignment & 1) != 0); + const size_t huge_alignment = (zero_huge_alignment & ~1); + + #if !MI_THEAP_INITASNULL + mi_assert_internal(theap != NULL); + #endif // initialize if necessary - if mi_unlikely(!mi_heap_is_initialized(heap)) { - heap = mi_heap_get_default(); // calls mi_thread_init - if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } + if mi_unlikely(!mi_theap_is_initialized(theap)) { + if (theap==&_mi_theap_empty_wrong) { + // we were unable to allocate a theap for a first-class heap + return NULL; + } + // otherwise we initialize the thread and its default theap + mi_thread_init(); + theap = _mi_theap_default(); + if mi_unlikely(!mi_theap_is_initialized(theap)) { return NULL; } + mi_assert_internal(_mi_theap_default()==theap); + } + mi_assert_internal(mi_theap_is_initialized(theap)); + + // do administrative tasks every N generic mallocs + if mi_unlikely(++theap->generic_count >= 1000) { + theap->generic_collect_count += theap->generic_count; + theap->generic_count = 0; + // call potential deferred free routines + _mi_deferred_free(theap, false); + // free retired pages + _mi_theap_collect_retired(theap, false); + + // collect every once in a while (10000 by default) + const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); + if (theap->generic_collect_count >= generic_collect) { + theap->generic_collect_count = 0; + mi_theap_collect(theap, false /* force? 
*/); + } } - mi_assert_internal(mi_heap_is_initialized(heap)); - - // call potential deferred free routines - _mi_deferred_free(heap, false); - - // free delayed frees from other threads (but skip contended ones) - _mi_heap_delayed_free_partial(heap); // find (or allocate) a page of the right size - mi_page_t* page = mi_find_page(heap, size, huge_alignment); + mi_page_t* page = mi_find_page(theap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more - mi_heap_collect(heap, true /* force */); - page = mi_find_page(heap, size, huge_alignment); + mi_theap_collect(theap, true /* force? */); + page = mi_find_page(theap, size, huge_alignment); } if mi_unlikely(page == NULL) { // out of memory @@ -928,16 +1015,17 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(mi_page_slice_start(page), MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) - if mi_unlikely(zero && page->block_size == 0) { - // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
- void* p = _mi_page_malloc(heap, page, size); - mi_assert_internal(p != NULL); - _mi_memzero_aligned(p, mi_page_usable_block_size(page)); - return p; - } - else { - return _mi_page_malloc_zero(heap, page, size, zero); + if (usable!=NULL) { *usable = mi_page_usable_block_size(page); } + void* const p = _mi_page_malloc_zero(theap,page,size,zero); + mi_assert_internal(p != NULL); + + // move full pages to the full queue + if (mi_page_block_size(page) > MI_SMALL_MAX_OBJ_SIZE && mi_page_is_full(page)) { + mi_page_to_full(page, mi_page_queue_of(page)); } + return p; } diff --git a/system/lib/mimalloc/src/prim/emscripten/prim.c b/system/lib/mimalloc/src/prim/emscripten/prim.c index 069cdb3b57ef8..5640b1e497be0 100644 --- a/system/lib/mimalloc/src/prim/emscripten/prim.c +++ b/system/lib/mimalloc/src/prim/emscripten/prim.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen, Alon Zakai This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -58,7 +58,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config) { extern void emmalloc_free(void*); int _mi_prim_free(void* addr, size_t size) { - MI_UNUSED(size); + if (size==0) return 0; emmalloc_free(addr); return 0; } @@ -71,8 +71,8 @@ int _mi_prim_free(void* addr, size_t size) { extern void* emmalloc_memalign(size_t alignment, size_t size); // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. 
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { - MI_UNUSED(allow_large); MI_UNUSED(commit); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; // TODO: Track the highest address ever seen; first uses of it are zeroes. // That assumes no one else uses sbrk but us (they could go up, @@ -110,6 +110,11 @@ int _mi_prim_reset(void* addr, size_t size) { return 0; } +int _mi_prim_reuse(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; @@ -196,15 +201,15 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Thread init/done //---------------------------------------------------------------- -#ifdef __EMSCRIPTEN_SHARED_MEMORY__ +#if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending -// (and used with MI_TLS_PTHREADS for the default heap) +// (and used with MI_TLS_PTHREADS for the default theap) pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) { - _mi_thread_done((mi_heap_t*)value); + _mi_thread_done((mi_theap_t*)value); } } @@ -219,9 +224,9 @@ void _mi_prim_thread_done_auto_done(void) { } } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD - pthread_setspecific(_mi_heap_default_key, heap); + pthread_setspecific(_mi_heap_default_key, theap); } } @@ -235,8 +240,11 @@ void _mi_prim_thread_done_auto_done(void) { // nothing } -void 
_mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - MI_UNUSED(heap); - +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); } #endif + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/system/lib/mimalloc/src/prim/osx/alloc-override-zone.c b/system/lib/mimalloc/src/prim/osx/alloc-override-zone.c index 1515b886b20b7..aa971c39fa066 100644 --- a/system/lib/mimalloc/src/prim/osx/alloc-override-zone.c +++ b/system/lib/mimalloc/src/prim/osx/alloc-override-zone.c @@ -64,7 +64,8 @@ static void* zone_valloc(malloc_zone_t* zone, size_t size) { static void zone_free(malloc_zone_t* zone, void* p) { MI_UNUSED(zone); - mi_cfree(p); + // mi_cfree(p); // checked free as `zone_free` may be called with invalid pointers + mi_free(p); // with the page_map and pagemap_commit=1 we can use the regular free } static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { @@ -83,7 +84,7 @@ static void zone_destroy(malloc_zone_t* zone) { } static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) { - size_t i; + unsigned i; for (i = 0; i < count; i++) { ps[i] = zone_malloc(zone, size); if (ps[i] == NULL) break; @@ -418,9 +419,9 @@ static inline malloc_zone_t* mi_get_default_zone(void) } #if defined(__clang__) -__attribute__((constructor(0))) +__attribute__((constructor(101))) // highest priority #else -__attribute__((constructor)) // seems not supported by g++-11 on the M1 +__attribute__((constructor)) // priority level is not supported by gcc #endif __attribute__((used)) static void _mi_macos_override_malloc(void) { diff --git a/system/lib/mimalloc/src/prim/prim.c b/system/lib/mimalloc/src/prim/prim.c index 3b7d373642f51..5147bae81feaa 100644 --- a/system/lib/mimalloc/src/prim/prim.c +++ b/system/lib/mimalloc/src/prim/prim.c @@ -25,3 +25,52 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "unix/prim.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) #endif + +// Generic process initialization +#ifndef MI_PRIM_HAS_PROCESS_ATTACH +#if defined(__GNUC__) || defined(__clang__) + // gcc,clang: use the constructor/destructor attribute + // which for both seem to run before regular constructors/destructors + #if defined(__clang__) + #define mi_attr_constructor __attribute__((constructor(101))) + #define mi_attr_destructor __attribute__((destructor(101))) + #else + #define mi_attr_constructor __attribute__((constructor)) + #define mi_attr_destructor __attribute__((destructor)) + #endif + static void mi_attr_constructor mi_process_attach(void) { + _mi_auto_process_init(); + } + static void mi_attr_destructor mi_process_detach(void) { + _mi_auto_process_done(); + } +#elif defined(__cplusplus) + // C++: use static initialization to detect process start/end + // This is not guaranteed to be first/last but the best we can generally do? 
+ struct mi_init_done_t { + mi_init_done_t() { + _mi_auto_process_init(); + } + ~mi_init_done_t() { + _mi_auto_process_done(); + } + }; + static mi_init_done_t mi_init_done; + #else + #pragma message("define a way to call _mi_auto_process_init/done on your platform") +#endif +#endif + +// Generic allocator init/done callback +#ifndef MI_PRIM_HAS_ALLOCATOR_INIT +bool _mi_is_redirected(void) { + return false; +} +bool _mi_allocator_init(const char** message) { + if (message != NULL) { *message = NULL; } + return true; +} +void _mi_allocator_done(void) { + // nothing to do +} +#endif diff --git a/system/lib/mimalloc/src/prim/unix/prim.c b/system/lib/mimalloc/src/prim/unix/prim.c index dd665d3d1f192..a40afa589ec47 100644 --- a/system/lib/mimalloc/src/prim/unix/prim.c +++ b/system/lib/mimalloc/src/prim/unix/prim.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -22,20 +22,22 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // mmap #include // sysconf #include // open, close, read, access +#include // getenv, arc4random_buf #if defined(__linux__) #include - #if defined(MI_NO_THP) - #include + #include // THP disable, PR_SET_VMA + #include // sysinfo + #if defined(__GLIBC__) && !defined(PR_SET_VMA) + #include #endif #if defined(__GLIBC__) - #include // linux mmap flags + #include // linux mmap flags #else #include #endif @@ -48,6 +50,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #if !defined(MAC_OS_X_VERSION_10_7) #define MAC_OS_X_VERSION_10_7 1070 #endif + #include #elif defined(__FreeBSD__) || defined(__DragonFly__) #include #if __FreeBSD_version >= 1200000 @@ -57,11 +60,19 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif -#if defined(__linux__) || defined(__FreeBSD__) +#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__FreeBSD__) #define MI_HAS_SYSCALL_H #include #endif +#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED) // QNX +#define MADV_DONTNEED POSIX_MADV_DONTNEED +#endif +#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE) // QNX +#define MADV_FREE POSIX_MADV_FREE +#endif + +#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this? //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. @@ -110,50 +121,101 @@ static inline int mi_prim_access(const char *fpath, int mode) { static bool unix_detect_overcommit(void) { bool os_overcommit = true; -#if defined(__linux__) - int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY); - if (fd >= 0) { + #if defined(__linux__) + int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd >= 0) { + char buf[32]; + ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf)); + mi_prim_close(fd); + // + // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) + if (nread >= 1) { + os_overcommit = (buf[0] == '0' || buf[0] == '1'); + } + } + #elif defined(__FreeBSD__) + int val = 0; + size_t olen = sizeof(val); + if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { + os_overcommit = (val != 0); + } + #else + // default: overcommit is true + #endif + return os_overcommit; +} + +static bool unix_detect_thp(void) { + bool thp_enabled = false; + #if defined(__linux__) + int fd = 
mi_prim_open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); + if (fd >= 0) { char buf[32]; ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf)); mi_prim_close(fd); - // - // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) + // + // between brackets is the current value, for example: always [madvise] never if (nread >= 1) { - os_overcommit = (buf[0] == '0' || buf[0] == '1'); + thp_enabled = (_mi_strnstr(buf,32,"[never]") == NULL); } } -#elif defined(__FreeBSD__) - int val = 0; - size_t olen = sizeof(val); - if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { - os_overcommit = (val != 0); - } -#else - // default: overcommit is true -#endif - return os_overcommit; + #endif + return thp_enabled; +} + +// try to detect the physical memory dynamically (if possible) +static void unix_detect_physical_memory( size_t page_size, size_t* physical_memory_in_kib ) { + #if defined(CTL_HW) && (defined(HW_PHYSMEM64) || defined(HW_MEMSIZE)) // freeBSD, macOS + MI_UNUSED(page_size); + int64_t physical_memory = 0; + size_t length = sizeof(int64_t); + #if defined(HW_PHYSMEM64) + int mib[2] = { CTL_HW, HW_PHYSMEM64 }; + #else + int mib[2] = { CTL_HW, HW_MEMSIZE }; + #endif + const int err = sysctl(mib, 2, &physical_memory, &length, NULL, 0); + if (err==0 && physical_memory > 0) { + const int64_t phys_in_kib = physical_memory / MI_KiB; + if (phys_in_kib > 0 && (uint64_t)phys_in_kib <= SIZE_MAX) { + *physical_memory_in_kib = (size_t)phys_in_kib; + } + } + #elif defined(__linux__) + MI_UNUSED(page_size); + struct sysinfo info; _mi_memzero_var(info); + const int err = sysinfo(&info); + if (err==0 && info.totalram > 0 && info.totalram <= SIZE_MAX) { + *physical_memory_in_kib = (size_t)info.totalram / MI_KiB; + } + #elif defined(_SC_PHYS_PAGES) // do not use by default as it might cause allocation (by using `fopen` to parse /proc/meminfo) (issue #1100) + const long pphys = sysconf(_SC_PHYS_PAGES); + const size_t psize_in_kib = 
page_size / MI_KiB; + if (psize_in_kib > 0 && pphys > 0 && (unsigned long)pphys <= SIZE_MAX && (size_t)pphys <= (SIZE_MAX/psize_in_kib)) { + *physical_memory_in_kib = (size_t)pphys * psize_in_kib; + } + #endif } void _mi_prim_mem_init( mi_os_mem_config_t* config ) { long psize = sysconf(_SC_PAGESIZE); - if (psize > 0) { + if (psize > 0 && (unsigned long)psize < SIZE_MAX) { config->page_size = (size_t)psize; config->alloc_granularity = (size_t)psize; + unix_detect_physical_memory(config->page_size, &config->physical_memory_in_kib); } - config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? + config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE; config->has_overcommit = unix_detect_overcommit(); config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) + config->has_transparent_huge_pages = unix_detect_thp(); // disable transparent huge pages for this process? #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE) - #if defined(MI_NO_THP) - if (true) - #else - if (!mi_option_is_enabled(mi_option_allow_large_os_pages)) // disable THP also if large OS pages are not allowed in the options - #endif + if (!mi_option_is_enabled(mi_option_allow_thp)) // disable THP if requested through an option { + config->has_transparent_huge_pages = false; int val = 0; if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { // Most likely since distros often come with always/madvise settings. @@ -171,6 +233,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) //--------------------------------------------- int _mi_prim_free(void* addr, size_t size ) { + if (size==0) return 0; bool err = (munmap(addr, size) == -1); return (err ? 
errno : 0); } @@ -182,20 +245,34 @@ int _mi_prim_free(void* addr, size_t size ) { static int unix_madvise(void* addr, size_t size, int advice) { #if defined(__sun) - return madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + #elif defined(__QNX__) + int res = posix_madvise(addr, size, advice); #else - return madvise(addr, size, advice); + int res = madvise(addr, size, advice); + #endif + return (res==0 ? 0 : errno); +} + +static void* unix_mmap_prim(void* addr, size_t size, int protect_flags, int flags, int fd) { + void* p = mmap(addr, size, protect_flags, flags, fd, 0 /* offset */); + #if defined(__linux__) && defined(PR_SET_VMA) + if (p!=MAP_FAILED && p!=NULL) { + prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, p, size, "mimalloc"); + } #endif + return p; } -static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { +static void* unix_mmap_prim_aligned(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { MI_UNUSED(try_alignment); void* p = NULL; #if defined(MAP_ALIGNED) // BSD if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { - size_t n = mi_bsr(try_alignment); + size_t n = 0; + mi_bsr(try_alignment, &n); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB - p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); + p = unix_mmap_prim(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd); if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { int err = errno; _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); @@ -206,7 +283,7 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p } #elif 
defined(MAP_ALIGN) // Solaris if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { - p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment + p = unix_mmap_prim((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd); // addr parameter is the required alignment if (p!=MAP_FAILED) return p; // fall back to regular mmap } @@ -216,7 +293,7 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p if (addr == NULL) { void* hint = _mi_os_get_aligned_hint(try_alignment, size); if (hint != NULL) { - p = mmap(hint, size, protect_flags, flags, fd, 0); + p = unix_mmap_prim(hint, size, protect_flags, flags, fd); if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { #if MI_TRACK_ENABLED // asan sometimes does not instrument errno correctly? int err = 0; @@ -231,7 +308,7 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p } #endif // regular mmap - p = mmap(addr, size, protect_flags, flags, fd, 0); + p = unix_mmap_prim(addr, size, protect_flags, flags, fd); if (p!=MAP_FAILED) return p; // failed to allocate return NULL; @@ -241,7 +318,7 @@ static int unix_mmap_fd(void) { #if defined(VM_MAKE_TAG) // macOS: tracking anonymous page with a specific ID. 
(All up to 98 are taken officially but LLVM sanitizers had taken 99) int os_tag = (int)mi_option_get(mi_option_os_tag); - if (os_tag < 100 || os_tag > 255) { os_tag = 100; } + if (os_tag < 100 || os_tag > 255) { os_tag = 254; } return VM_MAKE_TAG(os_tag); #else return -1; @@ -265,7 +342,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD #endif // huge page allocation - if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) { + if (allow_large && (large_only || (_mi_os_canuse_large_page(size, try_alignment) && mi_option_is_enabled(mi_option_allow_large_os_pages)))) { static _Atomic(size_t) large_page_try_ok; // = 0; size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { @@ -286,7 +363,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec #endif #ifdef MAP_HUGE_1GB static bool mi_huge_pages_available = true; - if ((size % MI_GiB) == 0 && mi_huge_pages_available) { + if (large_only && (size % MI_GiB) == 0 && mi_huge_pages_available) { lflags |= MAP_HUGE_1GB; } else @@ -302,13 +379,15 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec if (large_only || lflags != flags) { // try large OS page allocation *is_large = true; - p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd); #ifdef MAP_HUGE_1GB if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { mi_huge_pages_available = false; // don't try huge 1GiB pages again - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); + if (large_only) { + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); + } lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); - p = 
unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, lflags, lfd); } #endif if (large_only) return p; @@ -321,8 +400,9 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec // regular allocation if (p == NULL) { *is_large = false; - p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd); - if (p != NULL) { + p = unix_mmap_prim_aligned(addr, size, try_alignment, protect_flags, flags, fd); + #if !defined(MI_NO_THP) + if (p != NULL && allow_large && mi_option_is_enabled(mi_option_allow_thp) && _mi_os_canuse_large_page(size, try_alignment)) { #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE @@ -330,35 +410,35 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec // in that case -- in particular for our large regions (in `memory.c`). // However, some systems only allow THP if called with explicit `madvise`, so // when large OS pages are enabled for mimalloc, we call `madvise` anyways. 
- if (allow_large && _mi_os_use_large_page(size, try_alignment)) { - if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { - *is_large = true; // possibly - }; - } + if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { + // *is_large = true; // possibly + }; #elif defined(__sun) - if (allow_large && _mi_os_use_large_page(size, try_alignment)) { - struct memcntl_mha cmd = {0}; - cmd.mha_pagesize = _mi_os_large_page_size(); - cmd.mha_cmd = MHA_MAPSIZE_VA; - if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { - *is_large = true; - } + struct memcntl_mha cmd = {0}; + cmd.mha_pagesize = _mi_os_large_page_size(); + cmd.mha_cmd = MHA_MAPSIZE_VA; + if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { + // *is_large = true; // possibly } #endif } + #endif } return p; } // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); + if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { + try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations + } *is_zero = true; int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); - *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); + *addr = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large); return (*addr != NULL ? 
0 : errno); } @@ -368,7 +448,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la //--------------------------------------------- static void unix_mprotect_hint(int err) { - #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page + #if defined(__linux__) && ((MI_SECURE >= 2 && (!MI_PAGE_META_IS_SEPARATED || MI_PAGE_META_ALIGNED_FREE_SMALL)) || MI_SECURE >= 4) // guard page around every mimalloc page if (err == ENOMEM) { _mi_warning_message("The next warning may be caused by a low memory map limit.\n" " On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n" @@ -379,6 +459,10 @@ static void unix_mprotect_hint(int err) { #endif } + + + + int _mi_prim_commit(void* start, size_t size, bool* is_zero) { // commit: ensure we can access the area // note: we may think that *is_zero can be true since the memory @@ -394,11 +478,25 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) { return err; } +int _mi_prim_reuse(void* start, size_t size) { + MI_UNUSED(start); MI_UNUSED(size); + #if defined(__APPLE__) && defined(MADV_FREE_REUSE) + return unix_madvise(start, size, MADV_FREE_REUSE); + #endif + return 0; +} + int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; - // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) - err = unix_madvise(start, size, MADV_DONTNEED); - #if !MI_DEBUG && !MI_SECURE + #if defined(__APPLE__) && defined(MADV_FREE_REUSABLE) + // decommit on macOS: use MADV_FREE_REUSABLE as it does immediate rss accounting (issue #1097) + err = unix_madvise(start, size, MADV_FREE_REUSABLE); + if (err) { err = unix_madvise(start, size, MADV_DONTNEED); } + #else + // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) + err = unix_madvise(start, size, MADV_DONTNEED); + #endif + #if !MI_DEBUG && MI_SECURE<=2 *needs_recommit = false; #else *needs_recommit = true; @@ -415,14 +513,22 @@ int 
_mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { } int _mi_prim_reset(void* start, size_t size) { - // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it + int err = 0; + + // on macOS can use MADV_FREE_REUSABLE (but we disable this for now as it seems slower) + #if 0 && defined(__APPLE__) && defined(MADV_FREE_REUSABLE) + err = unix_madvise(start, size, MADV_FREE_REUSABLE); + if (err==0) return 0; + // fall through + #endif + + #if defined(MADV_FREE) + // Otherwise, we try to use `MADV_FREE` as that is the fastest. A drawback though is that it // will not reduce the `rss` stats in tools like `top` even though the memory is available // to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by // default `MADV_DONTNEED` is used though. - #if defined(MADV_FREE) static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); int oadvice = (int)mi_atomic_load_relaxed(&advice); - int err; while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on @@ -430,7 +536,7 @@ int _mi_prim_reset(void* start, size_t size) { err = unix_madvise(start, size, MADV_DONTNEED); } #else - int err = unix_madvise(start, size, MADV_DONTNEED); + err = unix_madvise(start, size, MADV_DONTNEED); #endif return err; } @@ -468,7 +574,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { bool is_large = true; *is_zero = true; - *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + *addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large); if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes unsigned 
long numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we @@ -760,7 +866,7 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { - // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf // may fail silently on macOS. See PR #390, and return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); } @@ -770,7 +876,6 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { defined(__sun) || \ (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) -#include bool _mi_prim_random_buf(void* buf, size_t buf_len) { arc4random_buf(buf, buf_len); return true; @@ -835,12 +940,12 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { #if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending -// (and used with MI_TLS_PTHREADS for the default heap) +// (and used with MI_TLS_PTHREADS for the default theap) pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) { - _mi_thread_done((mi_heap_t*)value); + _mi_thread_done((mi_theap_t*)value); } } @@ -855,9 +960,9 @@ void _mi_prim_thread_done_auto_done(void) { } } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD - pthread_setspecific(_mi_heap_default_key, heap); + pthread_setspecific(_mi_heap_default_key, theap); } } @@ -871,8 +976,12 @@ void _mi_prim_thread_done_auto_done(void) { // nothing } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - MI_UNUSED(heap); +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); } #endif + +bool 
_mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/system/lib/mimalloc/src/prim/wasi/prim.c b/system/lib/mimalloc/src/prim/wasi/prim.c index e95f67f587ea3..31b02bb601871 100644 --- a/system/lib/mimalloc/src/prim/wasi/prim.c +++ b/system/lib/mimalloc/src/prim/wasi/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs @@ -22,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; - config->has_overcommit = false; + config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; } @@ -33,7 +32,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) { int _mi_prim_free(void* addr, size_t size ) { MI_UNUSED(addr); MI_UNUSED(size); - // wasi heap cannot be shrunk + // wasi theap cannot be shrunk return 0; } @@ -63,7 +62,7 @@ int _mi_prim_free(void* addr, size_t size ) { #endif #if defined(MI_USE_PTHREADS) -static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t mi_theap_grow_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static void* mi_prim_mem_grow(size_t size, size_t try_alignment) { @@ -71,11 +70,11 @@ static void* mi_prim_mem_grow(size_t size, size_t try_alignment) { if (try_alignment <= 1) { // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now) #if defined(MI_USE_PTHREADS) - pthread_mutex_lock(&mi_heap_grow_mutex); + pthread_mutex_lock(&mi_theap_grow_mutex); #endif p = mi_memory_grow(size); #if defined(MI_USE_PTHREADS) - pthread_mutex_unlock(&mi_heap_grow_mutex); + pthread_mutex_unlock(&mi_theap_grow_mutex); #endif } else { @@ -85,21 +84,21 @@ static void* 
mi_prim_mem_grow(size_t size, size_t try_alignment) { // between getting the current size and actual allocation // (also, `sbrk` is not thread safe in general) #if defined(MI_USE_PTHREADS) - pthread_mutex_lock(&mi_heap_grow_mutex); + pthread_mutex_lock(&mi_theap_grow_mutex); #endif { void* current = mi_memory_grow(0); // get current size if (current != NULL) { - void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space + void* aligned_current = _mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size()); base = mi_memory_grow(alloc_size); } } #if defined(MI_USE_PTHREADS) - pthread_mutex_unlock(&mi_heap_grow_mutex); + pthread_mutex_unlock(&mi_theap_grow_mutex); #endif if (base != NULL) { - p = mi_align_up_ptr(base, try_alignment); + p = _mi_align_up_ptr(base, try_alignment); if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) { // another thread used wasm_memory_grow/sbrk in-between and we do not have enough // space after alignment. Give up (and waste the space as we cannot shrink :-( ) @@ -120,8 +119,8 @@ static void* mi_prim_mem_grow(size_t size, size_t try_alignment) { } // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. 
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { - MI_UNUSED(allow_large); MI_UNUSED(commit); +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; *is_zero = false; *addr = mi_prim_mem_grow(size, try_alignment); @@ -134,7 +133,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { - MI_UNUSED(addr); MI_UNUSED(size); + MI_UNUSED(addr); MI_UNUSED(size); *is_zero = false; return 0; } @@ -150,6 +149,11 @@ int _mi_prim_reset(void* addr, size_t size) { return 0; } +int _mi_prim_reuse(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; @@ -199,9 +203,9 @@ mi_msecs_t _mi_prim_clock_now(void) { // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) - return (mi_msecs_t)clock(); + return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) - return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif @@ -275,6 +279,10 @@ void _mi_prim_thread_done_auto_done(void) { // nothing } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - MI_UNUSED(heap); +void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); +} + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; } diff --git a/system/lib/mimalloc/src/prim/windows/prim.c 
b/system/lib/mimalloc/src/prim/windows/prim.c index 5074ad4cbd54c..6112a097d6a50 100644 --- a/system/lib/mimalloc/src/prim/windows/prim.c +++ b/system/lib/mimalloc/src/prim/windows/prim.c @@ -9,15 +9,25 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs, stderr +// xbox has no console IO +#if !defined(WINAPI_FAMILY_PARTITION) || WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) +#define MI_HAS_CONSOLE_IO +#endif //--------------------------------------------- // Dynamically bind Windows API points for portability //--------------------------------------------- +#if defined(_MSC_VER) +#pragma warning(disable:4996) // don't use GetVersionExW +#endif + +static DWORD win_major_version = 6; +static DWORD win_minor_version = 0; + // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. 
(use __stdcall for 32-bit compatibility) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) @@ -46,22 +56,35 @@ typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { #define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 #include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); -typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +typedef LONG (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); // avoid NTSTATUS as it is not defined on xbox (pr #1084) static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; -// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 +// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 (and GetNumaNodeProcessorMask is not supported on xbox) typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); +typedef BOOL (__stdcall* PGetNumaNodeProcessorMask)(UCHAR Node, PULONGLONG ProcessorMask); +typedef BOOL (__stdcall* PGetNumaHighestNodeNumber)(PULONG Node); static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; static PGetNumaProcessorNode 
pGetNumaProcessorNode = NULL; +static PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = NULL; +static PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = NULL; + +// Not available on xbox +typedef SIZE_T(__stdcall* PGetLargePageMinimum)(VOID); +static PGetLargePageMinimum pGetLargePageMinimum = NULL; + +// Available after Windows XP +typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes ); +typedef BOOL (__stdcall* PGetVersionExW)(LPOSVERSIONINFOW lpVersionInformation); + //--------------------------------------------- // Enable large page support dynamically (if possible) @@ -72,6 +95,7 @@ static bool win_enable_large_os_pages(size_t* large_page_size) static bool large_initialized = false; if (large_initialized) return (_mi_os_large_page_size() > 0); large_initialized = true; + if (pGetLargePageMinimum==NULL) return false; // no large page support (xbox etc.) // Try to see if large OS pages are supported // To use large pages on Windows, we first need access permission @@ -90,8 +114,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size) if (ok) { err = GetLastError(); ok = (err == ERROR_SUCCESS); - if (ok && large_page_size != NULL) { - *large_page_size = GetLargePageMinimum(); + if (ok && large_page_size != NULL && pGetLargePageMinimum != NULL) { + *large_page_size = (*pGetLargePageMinimum)(); } } } @@ -109,19 +133,30 @@ static bool win_enable_large_os_pages(size_t* large_page_size) // Initialize //--------------------------------------------- +static DWORD win_allocation_granularity = 64*MI_KiB; + void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = true; + // get the page size - SYSTEM_INFO si; + SYSTEM_INFO si; _mi_memzero_var(si); GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } - if (si.dwAllocationGranularity > 0) { config->alloc_granularity = 
si.dwAllocationGranularity; } + if (si.dwAllocationGranularity > 0) { + config->alloc_granularity = si.dwAllocationGranularity; + win_allocation_granularity = si.dwAllocationGranularity; + } + // get virtual address bits + if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { + const size_t vbits = MI_SIZE_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress); + config->virtual_address_bits = vbits; + } + // get the VirtualAlloc2 function - HINSTANCE hDll; - hDll = LoadLibrary(TEXT("kernelbase.dll")); + HINSTANCE hDll = LoadLibrary(TEXT("kernelbase.dll")); if (hDll != NULL) { // use VirtualAlloc2FromApp if possible as it is available to Windows store apps pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); @@ -141,8 +176,32 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); + pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMask"); + pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)(void (*)(void))GetProcAddress(hDll, "GetNumaHighestNodeNumber"); + pGetLargePageMinimum = (PGetLargePageMinimum)(void (*)(void))GetProcAddress(hDll, "GetLargePageMinimum"); + // Get physical memory (not available on XP, so check dynamically) + PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory"); + if (pGetPhysicallyInstalledSystemMemory != NULL) { + ULONGLONG memInKiB = 0; + if ((*pGetPhysicallyInstalledSystemMemory)(&memInKiB)) { + if (memInKiB > 0 && memInKiB <= SIZE_MAX) { + 
config->physical_memory_in_kib = (size_t)memInKiB; + } + } + } + // Get Windows version + PGetVersionExW pGetVersionExW = (PGetVersionExW)(void (*)(void))GetProcAddress(hDll, "GetVersionExW"); + if (pGetVersionExW != NULL) { + OSVERSIONINFOW version; _mi_memzero_var(version); + version.dwOSVersionInfoSize = sizeof(version); + if ((*pGetVersionExW)(&version)) { + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; + } + } FreeLibrary(hDll); } + // Enable large/huge OS page support? if (mi_option_is_enabled(mi_option_allow_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { win_enable_large_os_pages(&config->large_page_size); } @@ -162,9 +221,9 @@ int _mi_prim_free(void* addr, size_t size ) { // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside // the memory region returned by VirtualAlloc; in that case we need to free using // the start of the region. - MEMORY_BASIC_INFORMATION info = { 0 }; + MEMORY_BASIC_INFORMATION info; _mi_memzero_var(info); VirtualQuery(addr, &info, sizeof(info)); - if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { + if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) { errcode = 0; err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); if (err) { errcode = GetLastError(); } @@ -192,7 +251,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation - if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; @@ -228,10 +287,10 
@@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen // success, return the address return p; } - else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && + else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { - // if committing regular memory and being out-of-memory, + // if committing regular memory and being out-of-memory, // keep trying for a bit in case memory frees up after all. See issue #894 _mi_warning_message("out-of-memory on OS allocation, try again... (attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); long sleep_msecs = tries*40; // increasing waits @@ -252,8 +311,9 @@ static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DW static _Atomic(size_t) large_page_try_ok; // = 0; void* p = NULL; // Try to allocate large OS pages (2MiB) if allowed or required. - if ((large_only || _mi_os_use_large_page(size, try_alignment)) - && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { + if ((large_only || (_mi_os_canuse_large_page(size, try_alignment) && mi_option_is_enabled(mi_option_allow_large_os_pages))) + && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) + { size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. 
@@ -280,14 +340,14 @@ static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DW return p; } -int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { +int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); *is_zero = true; int flags = MEM_RESERVE; if (commit) { flags |= MEM_COMMIT; } - *addr = win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); + *addr = win_virtual_alloc(hint_addr, size, try_alignment, flags, false, allow_large, is_large); return (*addr != NULL ? 0 : (int)GetLastError()); } @@ -316,7 +376,7 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { return 0; } -int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); *needs_recommit = true; // for safety, assume always decommitted even in the case of an error. return (ok ? 0 : (int)GetLastError()); @@ -333,6 +393,11 @@ int _mi_prim_reset(void* addr, size_t size) { return (p != NULL ? 0 : (int)GetLastError()); } +int _mi_prim_reuse(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + int _mi_prim_protect(void* addr, size_t size, bool protect) { DWORD oldprotect = 0; BOOL ok = VirtualProtect(addr, size, protect ? 
PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); @@ -364,7 +429,7 @@ static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int num } SIZE_T psize = size; void* base = hint_addr; - NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); + LONG err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); if (err == 0 && base != NULL) { return base; } @@ -418,9 +483,11 @@ size_t _mi_prim_numa_node(void) { size_t _mi_prim_numa_node_count(void) { ULONG numa_max = 0; - GetNumaHighestNodeNumber(&numa_max); + if (pGetNumaHighestNodeNumber!=NULL) { + (*pGetNumaHighestNodeNumber)(&numa_max); + } // find the highest node number that has actual processors assigned to it. Issue #282 - while(numa_max > 0) { + while (numa_max > 0) { if (pGetNumaNodeProcessorMaskEx != NULL) { // Extended API is supported GROUP_AFFINITY affinity; @@ -431,8 +498,10 @@ size_t _mi_prim_numa_node_count(void) { else { // Vista or earlier, use older API that is limited to 64 processors. 
ULONGLONG mask; - if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { - if (mask != 0) break; // found the maximum non-empty node + if (pGetNumaNodeProcessorMask != NULL) { + if ((*pGetNumaNodeProcessorMask)((UCHAR)numa_max, &mask)) { + if (mask != 0) break; // found the maximum non-empty node + } }; } // max node was invalid or had no processor assigned, try again @@ -468,7 +537,6 @@ mi_msecs_t _mi_prim_clock_now(void) { // Process Info //---------------------------------------------------------------- -#include #include static mi_msecs_t filetime_msecs(const FILETIME* ftime) { @@ -491,7 +559,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); pinfo->utime = filetime_msecs(&ut); pinfo->stime = filetime_msecs(&st); - + // load psapi on demand if (pGetProcessMemoryInfo == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); @@ -501,11 +569,10 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) } // get process info - PROCESS_MEMORY_COUNTERS info; - memset(&info, 0, sizeof(info)); + PROCESS_MEMORY_COUNTERS info; _mi_memzero_var(info); if (pGetProcessMemoryInfo != NULL) { pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - } + } pinfo->current_rss = (size_t)info.WorkingSetSize; pinfo->peak_rss = (size_t)info.PeakWorkingSetSize; pinfo->current_commit = (size_t)info.PagefileUsage; @@ -517,24 +584,28 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) // Output //---------------------------------------------------------------- -void _mi_prim_out_stderr( const char* msg ) +void _mi_prim_out_stderr( const char* msg ) { // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. 
if (!_mi_preloading()) { // _cputs(msg); // _cputs cannot be used as it aborts when failing to lock the console static HANDLE hcon = INVALID_HANDLE_VALUE; - static bool hconIsConsole; + static bool hconIsConsole = false; if (hcon == INVALID_HANDLE_VALUE) { - CONSOLE_SCREEN_BUFFER_INFO sbi; hcon = GetStdHandle(STD_ERROR_HANDLE); + #ifdef MI_HAS_CONSOLE_IO + CONSOLE_SCREEN_BUFFER_INFO sbi; hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); + #endif } const size_t len = _mi_strlen(msg); if (len > 0 && len < UINT32_MAX) { DWORD written = 0; if (hconIsConsole) { + #ifdef MI_HAS_CONSOLE_IO WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); + #endif } else if (hcon != INVALID_HANDLE_VALUE) { // use direct write if stderr was redirected @@ -564,7 +635,6 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } - //---------------------------------------------------------------- // Random //---------------------------------------------------------------- @@ -600,64 +670,322 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { } if (pBCryptGenRandom == NULL) return false; } - return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } #endif // MI_USE_RTLGENRANDOM + //---------------------------------------------------------------- -// Thread init/done +// Thread pool? 
//---------------------------------------------------------------- -#if !defined(MI_SHARED_LIB) - -// use thread local storage keys to detect thread ending -// note: another design could be to use special linker sections (see issue #869) -#include -#if (_WIN32_WINNT < 0x600) // before Windows Vista -WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); -WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); -WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); -WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); +bool _mi_prim_thread_is_in_threadpool(void) { +#if (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64) + if (win_major_version >= 6) { + // check if this thread belongs to a windows threadpool + // see: + struct _TEB* const teb = NtCurrentTeb(); + void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); + return (pool_data != NULL); + } #endif + return false; +} + + +//---------------------------------------------------------------- +// Process & Thread Init/Done +//---------------------------------------------------------------- -static DWORD mi_fls_key = (DWORD)(-1); +//static void mi_debug_out(const char* s) { +// HANDLE h = GetStdHandle(STD_ERROR_HANDLE); +// WriteConsole(h, s, (DWORD)_mi_strlen(s), NULL, NULL); +//} -static void NTAPI mi_fls_done(PVOID value) { - mi_heap_t* heap = (mi_heap_t*)value; - if (heap != NULL) { - _mi_thread_done(heap); - FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 +static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { + MI_UNUSED(reserved); + MI_UNUSED(module); + if (reason==DLL_PROCESS_ATTACH) { + _mi_auto_process_init(); + } + else if (reason==DLL_PROCESS_DETACH) { + _mi_auto_process_done(); + } + else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) { + _mi_thread_done(NULL); } } -void _mi_prim_thread_init_auto_done(void) { 
- mi_fls_key = FlsAlloc(&mi_fls_done); -} -void _mi_prim_thread_done_auto_done(void) { - // call thread-done on all threads (except the main thread) to prevent - // dangling callback pointer if statically linked with a DLL; Issue #208 - FlsFree(mi_fls_key); -} +/* ---------------------------------------------------------------------- + Auto initialize and finalize mimalloc on process and thread start/end. + By default we use a combination of _pRawDllMain and TLS sections for + both static and dynamic linkage +------------------------------------------------------------------------- */ +#ifndef MI_WIN_NO_RAW_DLLMAIN + #define MI_PRIM_HAS_PROCESS_ATTACH 1 + // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. + void _mi_prim_thread_init_auto_done(void) {} + void _mi_prim_thread_done_auto_done(void) {} + void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); + } -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - mi_assert_internal(mi_fls_key != (DWORD)(-1)); - FlsSetValue(mi_fls_key, heap); -} + // If linked into a DLL module, this raw entry is called before the CRT attach and + // after the CRT detach through the CRT _pRawDllMain pointer. + static BOOL NTAPI mi_dll_main_raw(PVOID module, DWORD reason, LPVOID reserved) { + //if (reason == DLL_PROCESS_ATTACH) { mi_debug_out("dll process attach\n"); } + //else if (reason == DLL_PROCESS_DETACH) { mi_debug_out("dll process detach\n"); } + //else if (reason == DLL_THREAD_ATTACH) { mi_debug_out("dll thread attach\n"); } + //else if (reason == DLL_THREAD_DETACH) { mi_debug_out("dll thread detach\n"); } + mi_win_main(module, reason, reserved); + return TRUE; + } -#else + // Set the value of the CRT _pRawDllMain pointer + #if defined(__cplusplus) + extern "C" + #endif + PVOID _pRawDllMain = &mi_dll_main_raw; + + // We also hook into the Windows loader TLS initialization and finalization. 
+ // If we are linked into an EXE module we rely on these as `mi_dll_main_raw` + // is not called (and otherwise we ignore the TLS callbacks by checking if we are in a DLL). + static bool mi_module_is_dll(PVOID mod) { + if (mod==NULL) return false; + PIMAGE_DOS_HEADER imageDosHeader = (PIMAGE_DOS_HEADER)mod; + PIMAGE_NT_HEADERS imageNtHeaders = (PIMAGE_NT_HEADERS)((unsigned char*)imageDosHeader + imageDosHeader->e_lfanew); + return ((imageNtHeaders->FileHeader.Characteristics & IMAGE_FILE_DLL) == IMAGE_FILE_DLL); + } -// Dll; nothing to do as in that case thread_done is handled through the DLL_THREAD_DETACH event. + static void NTAPI mi_tls_attach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) { + if (!mi_module_is_dll(module)) { + // mi_debug_out(reason==DLL_PROCESS_ATTACH ? "exe process attach\n" : "exe thread attach\n"); + mi_win_main(module, reason, reserved); + } + } + } -void _mi_prim_thread_init_auto_done(void) { -} + static void NTAPI mi_tls_detach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) { + if (!mi_module_is_dll(module)) { + // mi_debug_out(reason==DLL_PROCESS_DETACH ? "exe process detach\n" : "exe thread detach\n"); + mi_win_main(module, reason, reserved); + } + } + } -void _mi_prim_thread_done_auto_done(void) { -} + // Set up TLS callbacks in a statically linked library by using special data sections. + // See + // We use 2 entries to ensure we call attach events before constructors + // are called, and detach events after destructors are called. 
+ #if defined(__cplusplus) + extern "C" { + #endif -void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { - MI_UNUSED(heap); -} + #if defined(_WIN64) + #pragma comment(linker, "/INCLUDE:_tls_used") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post") + #pragma const_seg(".CRT$XLB") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_tls_attach }; + #pragma const_seg() + #pragma const_seg(".CRT$XLY") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_tls_detach }; + #pragma const_seg() + #else + #pragma comment(linker, "/INCLUDE:__tls_used") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post") + #pragma data_seg(".CRT$XLB") + PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_tls_attach }; + #pragma data_seg() + #pragma data_seg(".CRT$XLY") + PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_tls_detach }; + #pragma data_seg() + #endif + #if defined(__cplusplus) + } + #endif + +/* ---------------------------------------- + legacy options: DllMain, TLS, and FLS +*/ +#elif defined(MI_SHARED_LIB) + #define MI_PRIM_HAS_PROCESS_ATTACH 1 + + // Windows DLL: easy to hook into process_init and thread_done + BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { + mi_win_main((PVOID)inst,reason,reserved); + return TRUE; + } + + // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. 
+ void _mi_prim_thread_init_auto_done(void) { } + void _mi_prim_thread_done_auto_done(void) { } + void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); + } + +#elif !defined(MI_WIN_USE_FLS) + #define MI_PRIM_HAS_PROCESS_ATTACH 1 + + static void NTAPI mi_win_main_attach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) { + mi_win_main(module, reason, reserved); + } + } + static void NTAPI mi_win_main_detach(PVOID module, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) { + mi_win_main(module, reason, reserved); + } + } + + // Set up TLS callbacks in a statically linked library by using special data sections. + // See + // We use 2 entries to ensure we call attach events before constructors + // are called, and detach events after destructors are called. + #if defined(__cplusplus) + extern "C" { + #endif + + #if defined(_WIN64) + #pragma comment(linker, "/INCLUDE:_tls_used") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post") + #pragma const_seg(".CRT$XLB") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach }; + #pragma const_seg() + #pragma const_seg(".CRT$XLY") + extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[]; + const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach }; + #pragma const_seg() + #else + #pragma comment(linker, "/INCLUDE:__tls_used") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre") + #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post") + #pragma data_seg(".CRT$XLB") + PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach }; + #pragma data_seg() + #pragma data_seg(".CRT$XIY") + PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach }; + #pragma data_seg() + #endif + + #if 
defined(__cplusplus) + } + #endif + + // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. + void _mi_prim_thread_init_auto_done(void) { } + void _mi_prim_thread_done_auto_done(void) { } + void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + MI_UNUSED(theap); + } + +#else // deprecated: statically linked, use fiber api + + #if defined(_MSC_VER) // on clang/gcc use the constructor attribute (in `src/prim/prim.c`) + // MSVC: use data section magic for static libraries + // See + #define MI_PRIM_HAS_PROCESS_ATTACH 1 + + static int mi_process_attach(void) { + mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL); + atexit(&_mi_auto_process_done); + return 0; + } + typedef int(*mi_crt_callback_t)(void); + #if defined(_WIN64) + #pragma comment(linker, "/INCLUDE:_mi_tls_callback") + #pragma section(".CRT$XIU", long, read) + #else + #pragma comment(linker, "/INCLUDE:__mi_tls_callback") + #endif + #pragma data_seg(".CRT$XIU") + mi_decl_externc mi_crt_callback_t _mi_tls_callback[] = { &mi_process_attach }; + #pragma data_seg() + #endif + + // use the fiber api for calling `_mi_thread_done`. 
+ #include + #if (_WIN32_WINNT < 0x600) // before Windows Vista + WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); + WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); + WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); + WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); + #endif + + static DWORD mi_fls_key = (DWORD)(-1); + + static void NTAPI mi_fls_done(PVOID value) { + mi_theap_t* theap = (mi_theap_t*)value; + if (theap != NULL) { + _mi_thread_done(theap); + FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main theap, issue #672 + } + } + + void _mi_prim_thread_init_auto_done(void) { + mi_fls_key = FlsAlloc(&mi_fls_done); + } + + void _mi_prim_thread_done_auto_done(void) { + // call thread-done on all threads (except the main thread) to prevent + // dangling callback pointer if statically linked with a DLL; Issue #208 + FlsFree(mi_fls_key); + } + + void _mi_prim_thread_associate_default_theap(mi_theap_t* theap) { + mi_assert_internal(mi_fls_key != (DWORD)(-1)); + FlsSetValue(mi_fls_key, theap); + } #endif + +// ---------------------------------------------------- +// Communicate with the redirection module on Windows +// ---------------------------------------------------- +#if defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT) + #define MI_PRIM_HAS_ALLOCATOR_INIT 1 + + static bool mi_redirected = false; // true if malloc redirects to mi_malloc + + bool _mi_is_redirected(void) { + return mi_redirected; + } + + #ifdef __cplusplus + extern "C" { + #endif + mi_decl_export void _mi_redirect_entry(DWORD reason) { + // called on redirection; careful as this may be called before DllMain + if (reason == DLL_PROCESS_ATTACH) { + mi_redirected = true; + } + else if (reason == DLL_PROCESS_DETACH) { + mi_redirected = false; + } + else if (reason == DLL_THREAD_DETACH) { + // mi_debug_out("redirect thread detach\n"); + _mi_thread_done(NULL); + } + 
} + __declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); + __declspec(dllimport) void mi_cdecl mi_allocator_done(void); + #ifdef __cplusplus + } + #endif + bool _mi_allocator_init(const char** message) { + return mi_allocator_init(message); + } + void _mi_allocator_done(void) { + mi_allocator_done(); + } +#endif + diff --git a/system/lib/mimalloc/src/random.c b/system/lib/mimalloc/src/random.c index 4fc8b2f8fb0bc..990e4894f3be8 100644 --- a/system/lib/mimalloc/src/random.c +++ b/system/lib/mimalloc/src/random.c @@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // _mi_prim_random_buf -#include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation @@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil (gcc x64 has no register spills, and clang 6+ uses SSE instructions) -----------------------------------------------------------------------------*/ -static inline uint32_t rotl(uint32_t x, uint32_t shift) { - return (x << shift) | (x >> (32 - shift)); -} - static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) { - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12); - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7); } static void chacha_block(mi_random_ctx_t* ctx) @@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as 
little endian but we do anyways // just for being compatible :-) - memset(ctx, 0, sizeof(*ctx)); + _mi_memzero(ctx, sizeof(*ctx)); for (size_t i = 0; i < 4; i++) { const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; ctx->input[i] = read32(sigma,i); @@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { - memset(ctx_new, 0, sizeof(*ctx_new)); + _mi_memzero(ctx_new, sizeof(*ctx_new)); _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); ctx_new->input[12] = 0; ctx_new->input[13] = 0; @@ -143,13 +138,17 @@ void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) { uintptr_t _mi_random_next(mi_random_ctx_t* ctx) { mi_assert_internal(mi_random_is_initialized(ctx)); - #if MI_INTPTR_SIZE <= 4 - return chacha_next32(ctx); - #elif MI_INTPTR_SIZE == 8 - return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx)); - #else - # error "define mi_random_next for this platform" - #endif + uintptr_t r; + do { + #if MI_INTPTR_SIZE <= 4 + r = chacha_next32(ctx); + #elif MI_INTPTR_SIZE == 8 + r = (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx)); + #else + # error "define mi_random_next for this platform" + #endif + } while (r==0); + return r; } @@ -160,10 +159,10 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random - x ^= _mi_prim_clock_now(); + x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; - for (uintptr_t i = 0; i < max; i++) { + for (uintptr_t i = 0; i < max || x==0; i++, x++) { x = _mi_random_shuffle(x); } mi_assert_internal(x != 0); @@ -179,7 +178,7 @@ static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) { if (!use_weak) { _mi_warning_message("unable 
to use secure randomness\n"); } #endif uintptr_t x = _mi_os_random_weak(0); - for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. + for (size_t i = 0; i < 8; i++, x++) { // key is eight 32-bit words. x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; } diff --git a/system/lib/mimalloc/src/segment-map.c b/system/lib/mimalloc/src/segment-map.c deleted file mode 100644 index 1efb1e2360bf2..0000000000000 --- a/system/lib/mimalloc/src/segment-map.c +++ /dev/null @@ -1,155 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ----------------------------------------------------------- - The following functions are to reliably find the segment or - block that encompasses any pointer p (or NULL if it is not - in any of our segments). - We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB) - set to 1 if it contains the segment meta data. 
------------------------------------------------------------ */ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN -#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) -#elif (MI_INTPTR_SIZE >= 8) -#define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) -#else -#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb -#endif - -#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE) -#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8) -#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE) - -static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments - -static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { - // note: segment can be invalid or NULL. - mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? - if ((uintptr_t)segment >= MI_MAX_ADDRESS) { - *bitidx = 0; - return MI_SEGMENT_MAP_WSIZE; - } - else { - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE; - *bitidx = segindex % MI_INTPTR_BITS; - const size_t mapindex = segindex / MI_INTPTR_BITS; - mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE); - return mapindex; - } -} - -void _mi_segment_map_allocated_at(const mi_segment_t* segment) { - size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index==MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); - uintptr_t newmask; - do { - newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); -} - -void _mi_segment_map_freed_at(const mi_segment_t* segment) { - size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index == 
MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); - uintptr_t newmask; - do { - newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); -} - -// Determine the segment belonging to a pointer or NULL if it is not in a valid segment. -static mi_segment_t* _mi_segment_of(const void* p) { - if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL - size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge - const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); - if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { - return segment; // yes, allocated by us - } - if (index==MI_SEGMENT_MAP_WSIZE) return NULL; - - // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers? - - // search downwards for the first segment in case it is an interior pointer - // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough - // valid huge objects - // note: we could maintain a lowest index to speed up the path for invalid pointers? - size_t lobitidx; - size_t loindex; - uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1); - if (lobits != 0) { - loindex = index; - lobitidx = mi_bsr(lobits); // lobits != 0 - } - else if (index == 0) { - return NULL; - } - else { - mi_assert_internal(index > 0); - uintptr_t lomask = mask; - loindex = index; - do { - loindex--; - lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]); - } while (lomask != 0 && loindex > 0); - if (lomask == 0) return NULL; - lobitidx = mi_bsr(lomask); // lomask != 0 - } - mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE); - // take difference as the addresses could be larger than the MAX_ADDRESS space. 
- size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE; - segment = (mi_segment_t*)((uint8_t*)segment - diff); - - if (segment == NULL) return NULL; - mi_assert_internal((void*)segment < p); - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); - if mi_unlikely(!cookie_ok) return NULL; - if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range - mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment)); - return segment; -} - -// Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p))); -} - -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return mi_is_valid_pointer(p); -} - -/* -// Return the full segment range belonging to a pointer -static void* mi_segment_range_of(const void* p, size_t* size) { - mi_segment_t* segment = _mi_segment_of(p); - if (segment == NULL) { - if (size != NULL) *size = 0; - return NULL; - } - else { - if (size != NULL) *size = segment->segment_size; - return segment; - } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - mi_reset_delayed(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - return page; -} -*/ diff --git a/system/lib/mimalloc/src/segment.c b/system/lib/mimalloc/src/segment.c deleted file mode 100644 index 4e4dcb80ee177..0000000000000 --- a/system/lib/mimalloc/src/segment.c +++ /dev/null @@ -1,1524 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -#include // memset -#include - -// ------------------------------------------------------------------- -// Segments -// mimalloc pages reside in segments. See `mi_segment_valid` for invariants. 
-// ------------------------------------------------------------------- - - -static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats); - - -// ------------------------------------------------------------------- -// commit mask -// ------------------------------------------------------------------- - -static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false; - } - return true; -} - -static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - if ((commit->mask[i] & cm->mask[i]) != 0) return true; - } - return false; -} - -static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - res->mask[i] = (commit->mask[i] & cm->mask[i]); - } -} - -static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - res->mask[i] &= ~(cm->mask[i]); - } -} - -static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) { - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - res->mask[i] |= cm->mask[i]; - } -} - -static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) { - mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS); - mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS); - if (bitcount == MI_COMMIT_MASK_BITS) { - mi_assert_internal(bitidx==0); - mi_commit_mask_create_full(cm); - } - else if (bitcount == 0) { - mi_commit_mask_create_empty(cm); - } - else { - mi_commit_mask_create_empty(cm); - size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS; - size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS; - while (bitcount > 0) 
{ - mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT); - size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs; - size_t count = (bitcount > avail ? avail : bitcount); - size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs); - cm->mask[i] = mask; - bitcount -= count; - ofs = 0; - i++; - } - } -} - -size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) { - mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0); - size_t count = 0; - for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { - size_t mask = cm->mask[i]; - if (~mask == 0) { - count += MI_COMMIT_MASK_FIELD_BITS; - } - else { - for (; mask != 0; mask >>= 1) { // todo: use popcount - if ((mask&1)!=0) count++; - } - } - } - // we use total since for huge segments each commit bit may represent a larger size - return ((total / MI_COMMIT_MASK_BITS) * count); -} - - -size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) { - size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS; - size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS; - size_t mask = 0; - // find first ones - while (i < MI_COMMIT_MASK_FIELD_COUNT) { - mask = cm->mask[i]; - mask >>= ofs; - if (mask != 0) { - while ((mask&1) == 0) { - mask >>= 1; - ofs++; - } - break; - } - i++; - ofs = 0; - } - if (i >= MI_COMMIT_MASK_FIELD_COUNT) { - // not found - *idx = MI_COMMIT_MASK_BITS; - return 0; - } - else { - // found, count ones - size_t count = 0; - *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs; - do { - mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1); - do { - count++; - mask >>= 1; - } while ((mask&1) == 1); - if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) { - i++; - if (i >= MI_COMMIT_MASK_FIELD_COUNT) break; - mask = cm->mask[i]; - ofs = 0; - } - } while ((mask&1) == 1); - mi_assert_internal(count > 0); - return count; - } -} - - -/* -------------------------------------------------------------------------------- - Segment allocation 
--------------------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------- - Slices ------------------------------------------------------------ */ - - -static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) { - return &segment->slices[segment->slice_entries]; -} - -static uint8_t* mi_slice_start(const mi_slice_t* slice) { - mi_segment_t* segment = _mi_ptr_segment(slice); - mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment)); - return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE)); -} - - -/* ----------------------------------------------------------- - Bins ------------------------------------------------------------ */ -// Use bit scan forward to quickly find the first zero bit if it is available - -static inline size_t mi_slice_bin8(size_t slice_count) { - if (slice_count<=1) return slice_count; - mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT); - slice_count--; - size_t s = mi_bsr(slice_count); // slice_count > 1 - if (s <= 2) return slice_count + 1; - size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4; - return bin; -} - -static inline size_t mi_slice_bin(size_t slice_count) { - mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE); - mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX); - size_t bin = mi_slice_bin8(slice_count); - mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX); - return bin; -} - -static inline size_t mi_slice_index(const mi_slice_t* slice) { - mi_segment_t* segment = _mi_ptr_segment(slice); - ptrdiff_t index = slice - segment->slices; - mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries); - return index; -} - - -/* ----------------------------------------------------------- - Slice span queues ------------------------------------------------------------ */ - -static void 
mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) { - // todo: or push to the end? - mi_assert_internal(slice->prev == NULL && slice->next==NULL); - slice->prev = NULL; // paranoia - slice->next = sq->first; - sq->first = slice; - if (slice->next != NULL) slice->next->prev = slice; - else sq->last = slice; - slice->block_size = 0; // free -} - -static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) { - size_t bin = mi_slice_bin(slice_count); - mi_span_queue_t* sq = &tld->spans[bin]; - mi_assert_internal(sq->slice_count >= slice_count); - return sq; -} - -static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) { - mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0); - // should work too if the queue does not contain slice (which can happen during reclaim) - if (slice->prev != NULL) slice->prev->next = slice->next; - if (slice == sq->first) sq->first = slice->next; - if (slice->next != NULL) slice->next->prev = slice->prev; - if (slice == sq->last) sq->last = slice->prev; - slice->prev = NULL; - slice->next = NULL; - slice->block_size = 1; // no more free -} - - -/* ----------------------------------------------------------- - Invariant checking ------------------------------------------------------------ */ - -static bool mi_slice_is_used(const mi_slice_t* slice) { - return (slice->block_size > 0); -} - - -#if (MI_DEBUG>=3) -static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) { - for (mi_slice_t* s = sq->first; s != NULL; s = s->next) { - if (s==slice) return true; - } - return false; -} - -static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment != NULL); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(segment->abandoned <= segment->used); - mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id()); - 
mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // can only decommit committed blocks - //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0); - mi_slice_t* slice = &segment->slices[0]; - const mi_slice_t* end = mi_segment_slices_end(segment); - size_t used_count = 0; - mi_span_queue_t* sq; - while(slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - size_t index = mi_slice_index(slice); - size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1; - if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET_COUNT valid back offsets - used_count++; - mi_assert_internal(slice->is_huge == (segment->kind == MI_SEGMENT_HUGE)); - for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET_COUNT && index + i <= maxindex; i++) { - mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t)); - mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0); - mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1); - } - // and the last entry as well (for coalescing) - const mi_slice_t* last = slice + slice->slice_count - 1; - if (last > slice && last < mi_segment_slices_end(segment)) { - mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t)); - mi_assert_internal(last->slice_count == 0); - mi_assert_internal(last->block_size == 1); - } - } - else { // free range of slices; only last slice needs a valid back offset - mi_slice_t* last = &segment->slices[maxindex]; - if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) { - mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset); - } - mi_assert_internal(slice == last || last->slice_count == 0 ); - mi_assert_internal(last->block_size == 0 || 
(segment->kind==MI_SEGMENT_HUGE && last->block_size==1)); - if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned - sq = mi_span_queue_for(slice->slice_count,tld); - mi_assert_internal(mi_span_queue_contains(sq,slice)); - } - } - slice = &segment->slices[maxindex+1]; - } - mi_assert_internal(slice == end); - mi_assert_internal(used_count == segment->used + 1); - return true; -} -#endif - -/* ----------------------------------------------------------- - Segment size calculations ------------------------------------------------------------ */ - -static size_t mi_segment_info_size(mi_segment_t* segment) { - return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE; -} - -static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t block_size, size_t* page_size) -{ - const ptrdiff_t idx = slice - segment->slices; - const size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE; - uint8_t* const pstart = (uint8_t*)segment + (idx*MI_SEGMENT_SLICE_SIZE); - // make the start not OS page aligned for smaller blocks to avoid page/cache effects - // note: the offset must always be a block_size multiple since we assume small allocations - // are aligned (see `mi_heap_malloc_aligned`). 
- size_t start_offset = 0; - if (block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { - // for small objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) - const size_t adjust = block_size - ((uintptr_t)pstart % block_size); - if (adjust < block_size && psize >= block_size + adjust) { - start_offset += adjust; - } - } - if (block_size >= MI_INTPTR_SIZE) { - if (block_size <= 64) { start_offset += 3*block_size; } - else if (block_size <= 512) { start_offset += block_size; } - } - if (page_size != NULL) { *page_size = psize - start_offset; } - return (pstart + start_offset); -} - -// Start of the page available memory; can be used on uninitialized pages -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) -{ - const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page); - uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, mi_page_block_size(page), page_size); - mi_assert_internal(mi_page_block_size(page) > 0 || _mi_ptr_page(p) == page); - mi_assert_internal(_mi_ptr_segment(p) == segment); - return p; -} - - -static size_t mi_segment_calculate_slices(size_t required, size_t* info_slices) { - size_t page_size = _mi_os_page_size(); - size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size); - size_t guardsize = 0; - - if (MI_SECURE>0) { - // in secure mode, we set up a protected page in between the segment info - // and the page data (and one at the end of the segment) - guardsize = page_size; - if (required > 0) { - required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size; - } - } - - isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE); - if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE; - size_t segment_size = (required==0 ? 
MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) ); - mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0); - return (segment_size / MI_SEGMENT_SLICE_SIZE); -} - - -/* ---------------------------------------------------------------------------- -Segment caches -We keep a small segment cache per thread to increase local -reuse and avoid setting/clearing guard pages in secure mode. -------------------------------------------------------------------------------- */ - -static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { - if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1); - else _mi_stat_decrease(&tld->stats->segments,1); - tld->count += (segment_size >= 0 ? 1 : -1); - if (tld->count > tld->peak_count) tld->peak_count = tld->count; - tld->current_size += segment_size; - if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size; -} - -static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { - segment->thread_id = 0; - _mi_segment_map_freed_at(segment); - mi_segments_track_size(-((long)mi_segment_size(segment)),tld); - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - if (MI_SECURE>0) { - // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set - // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted - size_t os_pagesize = _mi_os_page_size(); - _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); - uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; - _mi_os_unprotect(end, os_pagesize); - } - - // purge delayed decommits now? 
(no, leave it to the arena) - // mi_segment_try_purge(segment,true,tld->stats); - - const size_t size = mi_segment_size(segment); - const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size); - - _mi_abandoned_await_readers(); // wait until safe to free - _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats); -} - -/* ----------------------------------------------------------- - Commit/Decommit ranges ------------------------------------------------------------ */ - -static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) { - mi_assert_internal(_mi_ptr_segment(p + 1) == segment); - mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); - mi_commit_mask_create_empty(cm); - if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return; - const size_t segstart = mi_segment_info_size(segment); - const size_t segsize = mi_segment_size(segment); - if (p >= (uint8_t*)segment + segsize) return; - - size_t pstart = (p - (uint8_t*)segment); - mi_assert_internal(pstart + size <= segsize); - - size_t start; - size_t end; - if (conservative) { - // decommit conservative - start = _mi_align_up(pstart, MI_COMMIT_SIZE); - end = _mi_align_down(pstart + size, MI_COMMIT_SIZE); - mi_assert_internal(start >= segstart); - mi_assert_internal(end <= segsize); - } - else { - // commit liberal - start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE); - end = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE); - } - if (pstart >= segstart && start < segstart) { // note: the mask is also calculated for an initial commit of the info area - start = segstart; - } - if (end > segsize) { - end = segsize; - } - - mi_assert_internal(start <= pstart && (pstart + size) <= end); - mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0); - *start_p = (uint8_t*)segment + start; - *full_size = (end > start ? 
end - start : 0); - if (*full_size == 0) return; - - size_t bitidx = start / MI_COMMIT_SIZE; - mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS); - - size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0 - if (bitidx + bitcount > MI_COMMIT_MASK_BITS) { - _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size); - } - mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS); - mi_commit_mask_create(bitidx, bitcount, cm); -} - -static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { - mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); - - // commit liberal - uint8_t* start = NULL; - size_t full_size = 0; - mi_commit_mask_t mask; - mi_segment_commit_mask(segment, false /* conservative? */, p, size, &start, &full_size, &mask); - if (mi_commit_mask_is_empty(&mask) || full_size == 0) return true; - - if (!mi_commit_mask_all_set(&segment->commit_mask, &mask)) { - // committing - bool is_zero = false; - mi_commit_mask_t cmask; - mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); - _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap - if (!_mi_os_commit(start, full_size, &is_zero, stats)) return false; - mi_commit_mask_set(&segment->commit_mask, &mask); - } - - // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon. 
- if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) { - segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay); - } - - // always clear any delayed purges in our range (as they are either committed now) - mi_commit_mask_clear(&segment->purge_mask, &mask); - return true; -} - -static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { - mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); - // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow - if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->purge_mask)) return true; // fully committed - mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); - return mi_segment_commit(segment, p, size, stats); -} - -static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { - mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); - if (!segment->allow_purge) return true; - - // purge conservative - uint8_t* start = NULL; - size_t full_size = 0; - mi_commit_mask_t mask; - mi_segment_commit_mask(segment, true /* conservative? 
*/, p, size, &start, &full_size, &mask); - if (mi_commit_mask_is_empty(&mask) || full_size==0) return true; - - if (mi_commit_mask_any_set(&segment->commit_mask, &mask)) { - // purging - mi_assert_internal((void*)start != (void*)segment); - mi_assert_internal(segment->allow_decommit); - const bool decommitted = _mi_os_purge(start, full_size, stats); // reset or decommit - if (decommitted) { - mi_commit_mask_t cmask; - mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); - _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting - mi_commit_mask_clear(&segment->commit_mask, &mask); - } - } - - // always clear any scheduled purges in our range - mi_commit_mask_clear(&segment->purge_mask, &mask); - return true; -} - -static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) { - if (!segment->allow_purge) return; - - if (mi_option_get(mi_option_purge_delay) == 0) { - mi_segment_purge(segment, p, size, stats); - } - else { - // register for future purge in the purge mask - uint8_t* start = NULL; - size_t full_size = 0; - mi_commit_mask_t mask; - mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask); - if (mi_commit_mask_is_empty(&mask) || full_size==0) return; - - // update delayed commit - mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask)); - mi_commit_mask_t cmask; - mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); // only purge what is committed; span_free may try to decommit more - mi_commit_mask_set(&segment->purge_mask, &cmask); - mi_msecs_t now = _mi_clock_now(); - if (segment->purge_expire == 0) { - // no previous purgess, initialize now - segment->purge_expire = now + mi_option_get(mi_option_purge_delay); - } - else if (segment->purge_expire <= now) { - // previous purge mask already expired - if 
(segment->purge_expire + mi_option_get(mi_option_purge_extend_delay) <= now) { - mi_segment_try_purge(segment, true, stats); - } - else { - segment->purge_expire = now + mi_option_get(mi_option_purge_extend_delay); // (mi_option_get(mi_option_purge_delay) / 8); // wait a tiny bit longer in case there is a series of free's - } - } - else { - // previous purge mask is not yet expired, increase the expiration by a bit. - segment->purge_expire += mi_option_get(mi_option_purge_extend_delay); - } - } -} - -static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats) { - if (!segment->allow_purge || segment->purge_expire == 0 || mi_commit_mask_is_empty(&segment->purge_mask)) return; - mi_msecs_t now = _mi_clock_now(); - if (!force && now < segment->purge_expire) return; - - mi_commit_mask_t mask = segment->purge_mask; - segment->purge_expire = 0; - mi_commit_mask_create_empty(&segment->purge_mask); - - size_t idx; - size_t count; - mi_commit_mask_foreach(&mask, idx, count) { - // if found, decommit that sequence - if (count > 0) { - uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE); - size_t size = count * MI_COMMIT_SIZE; - mi_segment_purge(segment, p, size, stats); - } - } - mi_commit_mask_foreach_end() - mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); -} - -// called from `mi_heap_collect_ex` -// this can be called per-page so it is important that try_purge has fast exit path -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - mi_segment_try_purge(segment, force, tld->stats); -} - -/* ----------------------------------------------------------- - Span free ------------------------------------------------------------ */ - -static bool mi_segment_is_abandoned(mi_segment_t* segment) { - return (mi_atomic_load_relaxed(&segment->thread_id) == 0); -} - -// note: can be called on abandoned segments -static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, 
bool allow_purge, mi_segments_tld_t* tld) { - mi_assert_internal(slice_index < segment->slice_entries); - mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) - ? NULL : mi_span_queue_for(slice_count,tld)); - if (slice_count==0) slice_count = 1; - mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries); - - // set first and last slice (the intermediates can be undetermined) - mi_slice_t* slice = &segment->slices[slice_index]; - slice->slice_count = (uint32_t)slice_count; - mi_assert_internal(slice->slice_count == slice_count); // no overflow? - slice->slice_offset = 0; - if (slice_count > 1) { - mi_slice_t* last = slice + slice_count - 1; - mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); - if (last > end) { last = end; } - last->slice_count = 0; - last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1)); - last->block_size = 0; - } - - // perhaps decommit - if (allow_purge) { - mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats); - } - - // and push it on the free page queue (if it was not a huge page) - if (sq != NULL) mi_span_queue_push( sq, slice ); - else slice->block_size = 0; // mark huge page as free anyways -} - -/* -// called from reclaim to add existing free spans -static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) { - mi_segment_t* segment = _mi_ptr_segment(slice); - mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0); - size_t slice_index = mi_slice_index(slice); - mi_segment_span_free(segment,slice_index,slice->slice_count,tld); -} -*/ - -static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) { - mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->block_size==0); - mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE); - mi_span_queue_t* sq = 
mi_span_queue_for(slice->slice_count, tld); - mi_span_queue_delete(sq, slice); -} - -// note: can be called on abandoned segments -static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) { - mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0); - mi_segment_t* const segment = _mi_ptr_segment(slice); - const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); - - // for huge pages, just mark as free but don't add to the queues - if (segment->kind == MI_SEGMENT_HUGE) { - // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case) - mi_assert_internal((segment->used==0 && slice->block_size==0) || segment->used == 1); // decreased right after this call in `mi_segment_page_clear` - slice->block_size = 0; // mark as free anyways - // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to - // avoid a possible cache miss (and the segment is about to be freed) - return slice; - } - - // otherwise coalesce the span and add to the free span queues - size_t slice_count = slice->slice_count; - mi_slice_t* next = slice + slice->slice_count; - mi_assert_internal(next <= mi_segment_slices_end(segment)); - if (next < mi_segment_slices_end(segment) && next->block_size==0) { - // free next block -- remove it from free and merge - mi_assert_internal(next->slice_count > 0 && next->slice_offset==0); - slice_count += next->slice_count; // extend - if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); } - } - if (slice > segment->slices) { - mi_slice_t* prev = mi_slice_first(slice - 1); - mi_assert_internal(prev >= segment->slices); - if (prev->block_size==0) { - // free previous slice -- remove it from free and merge - mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0); - slice_count += prev->slice_count; - if (!is_abandoned) { 
mi_segment_span_remove_from_queue(prev, tld); } - slice = prev; - } - } - - // and add the new free page - mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld); - return slice; -} - - - -/* ----------------------------------------------------------- - Page allocation ------------------------------------------------------------ */ - -// Note: may still return NULL if committing the memory failed -static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) { - mi_assert_internal(slice_index < segment->slice_entries); - mi_slice_t* const slice = &segment->slices[slice_index]; - mi_assert_internal(slice->block_size==0 || slice->block_size==1); - - // commit before changing the slice data - if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) { - return NULL; // commit failed! - } - - // convert the slices to a page - slice->slice_offset = 0; - slice->slice_count = (uint32_t)slice_count; - mi_assert_internal(slice->slice_count == slice_count); - const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE; - slice->block_size = bsize; - mi_page_t* page = mi_slice_to_page(slice); - mi_assert_internal(mi_page_block_size(page) == bsize); - - // set slice back pointers for the first MI_MAX_SLICE_OFFSET_COUNT entries - size_t extra = slice_count-1; - if (extra > MI_MAX_SLICE_OFFSET_COUNT) extra = MI_MAX_SLICE_OFFSET_COUNT; - if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than avaiable entries in the segment->slices - - mi_slice_t* slice_next = slice + 1; - for (size_t i = 1; i <= extra; i++, slice_next++) { - slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i); - slice_next->slice_count = 0; - slice_next->block_size = 1; - } - - // and also for the last one (if not set already) (the last one 
is needed for coalescing and for large alignments) - // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543) - mi_slice_t* last = slice + slice_count - 1; - mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); - if (last > end) last = end; - if (last > slice) { - last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice)); - last->slice_count = 0; - last->block_size = 1; - } - - // and initialize the page - page->is_committed = true; - page->is_huge = (segment->kind == MI_SEGMENT_HUGE); - segment->used++; - return page; -} - -static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) { - mi_assert_internal(_mi_ptr_segment(slice) == segment); - mi_assert_internal(slice->slice_count >= slice_count); - mi_assert_internal(slice->block_size > 0); // no more in free queue - if (slice->slice_count <= slice_count) return; - mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); - size_t next_index = mi_slice_index(slice) + slice_count; - size_t next_count = slice->slice_count - slice_count; - mi_segment_span_free(segment, next_index, next_count, false /* don't purge left-over part */, tld); - slice->slice_count = (uint32_t)slice_count; -} - -static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) { - mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX); - // search from best fit up - mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld); - if (slice_count == 0) slice_count = 1; - while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) { - for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) { - if (slice->slice_count >= slice_count) { - // found one - mi_segment_t* segment = _mi_ptr_segment(slice); - if (_mi_arena_memid_is_suitable(segment->memid, req_arena_id)) { - // found a suitable page span - 
mi_span_queue_delete(sq, slice); - - if (slice->slice_count > slice_count) { - mi_segment_slice_split(segment, slice, slice_count, tld); - } - mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->block_size > 0); - mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld); - if (page == NULL) { - // commit failed; return NULL but first restore the slice - mi_segment_span_free_coalesce(slice, tld); - return NULL; - } - return page; - } - } - } - sq++; - } - // could not find a page.. - return NULL; -} - - -/* ----------------------------------------------------------- - Segment allocation ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id, - size_t* psegment_slices, size_t* pinfo_slices, - bool commit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) - -{ - mi_memid_t memid; - bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy - size_t align_offset = 0; - size_t alignment = MI_SEGMENT_ALIGN; - - if (page_alignment > 0) { - // mi_assert_internal(huge_page != NULL); - mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN); - alignment = page_alignment; - const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE; - align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN ); - const size_t extra = align_offset - info_size; - // recalculate due to potential guard pages - *psegment_slices = mi_segment_calculate_slices(required + extra, pinfo_slices); - mi_assert_internal(*psegment_slices > 0 && *psegment_slices <= UINT32_MAX); - } - - const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE; - mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, os_tld); - if (segment == NULL) { - return NULL; 
// failed to allocate - } - - // ensure metadata part of the segment is committed - mi_commit_mask_t commit_mask; - if (memid.initially_committed) { - mi_commit_mask_create_full(&commit_mask); - } - else { - // at least commit the info slices - const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); - mi_assert_internal(commit_needed>0); - mi_commit_mask_create(0, commit_needed, &commit_mask); - mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE); - if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL, tld->stats)) { - _mi_arena_free(segment,segment_size,0,memid,tld->stats); - return NULL; - } - } - mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - - segment->memid = memid; - segment->allow_decommit = !memid.is_pinned; - segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); - segment->segment_size = segment_size; - segment->commit_mask = commit_mask; - segment->purge_expire = 0; - mi_commit_mask_create_empty(&segment->purge_mask); - - mi_segments_track_size((long)(segment_size), tld); - _mi_segment_map_allocated_at(segment); - return segment; -} - - -// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . 
-static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page) -{ - mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL)); - - // calculate needed sizes first - size_t info_slices; - size_t segment_slices = mi_segment_calculate_slices(required, &info_slices); - mi_assert_internal(segment_slices > 0 && segment_slices <= UINT32_MAX); - - // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little) - const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems - _mi_current_thread_count() > 1 && // do not delay for the first N threads - tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); - const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); - bool commit = eager || (required > 0); - - // Allocate the segment from the OS - mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, - &segment_slices, &info_slices, commit, tld, os_tld); - if (segment == NULL) return NULL; - - // zero the segment info? -- not always needed as it may be zero initialized from the OS - if (!segment->memid.initially_zero) { - ptrdiff_t ofs = offsetof(mi_segment_t, next); - size_t prefix = offsetof(mi_segment_t, slices) - ofs; - size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more - _mi_memzero((uint8_t*)segment + ofs, zsize); - } - - // initialize the rest of the segment info - const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices); - segment->segment_slices = segment_slices; - segment->segment_info_slices = info_slices; - segment->thread_id = _mi_thread_id(); - segment->cookie = _mi_ptr_cookie(segment); - segment->slice_entries = slice_entries; - segment->kind = (required == 0 ? 
MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE); - - // _mi_memzero(segment->slices, sizeof(mi_slice_t)*(info_slices+1)); - _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment)); - - // set up guard pages - size_t guard_slices = 0; - if (MI_SECURE>0) { - // in secure mode, we set up a protected page in between the segment info - // and the page data, and at the end of the segment. - size_t os_pagesize = _mi_os_page_size(); - _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); - uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; - mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats); - _mi_os_protect(end, os_pagesize); - if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-( - guard_slices = 1; - } - - // reserve first slices for segment info - mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld); - mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance - mi_assert_internal(segment->used == 1); - segment->used = 0; // don't count our internal slices towards usage - - // initialize initial free pages - if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page - mi_assert_internal(huge_page==NULL); - mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't purge */, tld); - } - else { - mi_assert_internal(huge_page!=NULL); - mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); - mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask)); - *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld); - mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance - } - - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - return segment; -} - - -static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { 
- MI_UNUSED(force); - mi_assert_internal(segment != NULL); - mi_assert_internal(segment->next == NULL); - mi_assert_internal(segment->used == 0); - - // Remove the free pages - mi_slice_t* slice = &segment->slices[0]; - const mi_slice_t* end = mi_segment_slices_end(segment); - #if MI_DEBUG>1 - size_t page_count = 0; - #endif - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - mi_assert_internal(mi_slice_index(slice)==0 || slice->block_size == 0); // no more used pages .. - if (slice->block_size == 0 && segment->kind != MI_SEGMENT_HUGE) { - mi_segment_span_remove_from_queue(slice, tld); - } - #if MI_DEBUG>1 - page_count++; - #endif - slice = slice + slice->slice_count; - } - mi_assert_internal(page_count == 2); // first page is allocated by the segment itself - - // stats - _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment)); - - // return it to the OS - mi_segment_os_free(segment, tld); -} - - -/* ----------------------------------------------------------- - Page Free ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); - -// note: can be called on abandoned pages -static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(page->block_size > 0); - mi_assert_internal(mi_page_all_free(page)); - mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment->used > 0); - - size_t inuse = page->capacity * mi_page_block_size(page); - _mi_stat_decrease(&tld->stats->page_committed, inuse); - _mi_stat_decrease(&tld->stats->pages, 1); - - // reset the page memory to reduce memory pressure? 
- if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) { - size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, &psize); - _mi_os_reset(start, psize, tld->stats); - } - - // zero the page data, but not the segment fields and heap tag - page->is_zero_init = false; - uint8_t heap_tag = page->heap_tag; - ptrdiff_t ofs = offsetof(mi_page_t, capacity); - _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); - page->block_size = 1; - page->heap_tag = heap_tag; - - // and free it - mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld); - segment->used--; - // cannot assert segment valid as it is called during reclaim - // mi_assert_expensive(mi_segment_is_valid(segment, tld)); - return slice; -} - -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) -{ - mi_assert(page != NULL); - - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - - // mark it as free now - mi_segment_page_clear(page, tld); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - - if (segment->used == 0) { - // no more used pages; remove from the free list and free the segment - mi_segment_free(segment, force, tld); - } - else if (segment->used == segment->abandoned) { - // only abandoned pages; remove from free list and abandon - mi_segment_abandon(segment,tld); - } - else { - // perform delayed purges - mi_segment_try_purge(segment, false /* force? */, tld->stats); - } -} - - -/* ----------------------------------------------------------- -Abandonment - -When threads terminate, they can leave segments with -live blocks (reachable through other threads). Such segments -are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually. The -`thread_id` of such segments is 0. - -When a block is freed in an abandoned segment, the segment -is reclaimed into that thread. 
- -Moreover, if threads are looking for a fresh segment, they -will first consider abondoned segments -- these can be found -by scanning the arena memory -(segments outside arena memoryare only reclaimed by a free). ------------------------------------------------------------ */ - -// legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list -void _mi_abandoned_await_readers(void) { - // nothing needed -} - -/* ----------------------------------------------------------- - Abandon segment/page ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment->used == segment->abandoned); - mi_assert_internal(segment->used > 0); - mi_assert_internal(segment->abandoned_visits == 0); - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - - // remove the free pages from the free page queues - mi_slice_t* slice = &segment->slices[0]; - const mi_slice_t* end = mi_segment_slices_end(segment); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (slice->block_size == 0) { // a free page - mi_segment_span_remove_from_queue(slice,tld); - slice->block_size = 0; // but keep it free - } - slice = slice + slice->slice_count; - } - - // perform delayed decommits (forcing is much slower on mstress) - // Only abandoned segments in arena memory can be reclaimed without a free - // so if a segment is not from an arena we force purge here to be conservative. 
- const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); - mi_segment_try_purge(segment, force_purge, tld->stats); - - // all pages in the segment are abandoned; add it to the abandoned list - _mi_stat_increase(&tld->stats->segments_abandoned, 1); - mi_segments_track_size(-((long)mi_segment_size(segment)), tld); - segment->thread_id = 0; - segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - _mi_arena_segment_mark_abandoned(segment); -} - -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert(page != NULL); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - mi_segment_t* segment = _mi_page_segment(page); - - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - segment->abandoned++; - - _mi_stat_increase(&tld->stats->pages_abandoned, 1); - mi_assert_internal(segment->abandoned <= segment->used); - if (segment->used == segment->abandoned) { - // all pages are abandoned, abandon the entire segment - mi_segment_abandon(segment, tld); - } -} - -/* ----------------------------------------------------------- - Reclaim abandoned pages ------------------------------------------------------------ */ - -static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) { - mi_slice_t* slice = &segment->slices[0]; - *end = mi_segment_slices_end(segment); - mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page - slice = slice + slice->slice_count; // skip the first segment allocated page - return slice; -} - -// Possibly free pages and check if free space is available -static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) -{ - 
mi_assert_internal(mi_segment_is_abandoned(segment)); - bool has_page = false; - - // for all slices - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (mi_slice_is_used(slice)) { // used page - // ensure used count is up to date and collect potential concurrent frees - mi_page_t* const page = mi_slice_to_page(slice); - _mi_page_free_collect(page, false); - if (mi_page_all_free(page)) { - // if this page is all free now, free it without adding to any queues (yet) - mi_assert_internal(page->next == NULL && page->prev==NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - segment->abandoned--; - slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce! - mi_assert_internal(!mi_slice_is_used(slice)); - if (slice->slice_count >= slices_needed) { - has_page = true; - } - } - else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { - // a page has available free blocks of the right size - has_page = true; - } - } - else { - // empty span - if (slice->slice_count >= slices_needed) { - has_page = true; - } - } - slice = slice + slice->slice_count; - } - return has_page; -} - -// Reclaim an abandoned segment; returns NULL if the segment was freed -// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. -static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. 
- mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - segment->abandoned_visits = 0; - segment->was_reclaimed = true; - tld->reclaim_count++; - mi_segments_track_size((long)mi_segment_size(segment), tld); - mi_assert_internal(segment->next == NULL); - _mi_stat_decrease(&tld->stats->segments_abandoned, 1); - - // for all slices - const mi_slice_t* end; - mi_slice_t* slice = mi_slices_start_iterate(segment, &end); - while (slice < end) { - mi_assert_internal(slice->slice_count > 0); - mi_assert_internal(slice->slice_offset == 0); - if (mi_slice_is_used(slice)) { - // in use: reclaim the page in our heap - mi_page_t* page = mi_slice_to_page(slice); - mi_assert_internal(page->is_committed); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - mi_assert_internal(page->next == NULL && page->prev==NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - segment->abandoned--; - // set the heap again and allow heap thread delayed free again. 
- mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects - if (target_heap == NULL) { - target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); - } - mi_page_set_heap(page, target_heap); - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) - _mi_page_free_collect(page, false); // ensure used count is up to date - if (mi_page_all_free(page)) { - // if everything free by now, free the page - slice = mi_segment_page_clear(page, tld); // set slice again due to coalesceing - } - else { - // otherwise reclaim it into the heap - _mi_page_reclaim(target_heap, page); - if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } - } - } - } - else { - // the span is free, add it to our page queues - slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing - } - mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0); - slice = slice + slice->slice_count; - } - - mi_assert(segment->abandoned == 0); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - if (segment->used == 0) { // due to page_clear - mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); - mi_segment_free(segment, false, tld); - return NULL; - } - else { - return segment; - } -} - -// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { - if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - // don't reclaim more from a free than half the current segments - // this is to prevent a pure free-ing thread to start owning too many segments - if 
(heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; - if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); - mi_assert_internal(res == segment); - return (res != NULL); - } - return false; -} - -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } -} - -static long mi_segment_get_reclaim_tries(void) { - // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. - const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); - if (perc <= 0) return 0; - const size_t total_count = _mi_arena_segment_abandoned_count(); - if (total_count == 0) return 0; - const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow - long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); - if (max_tries < 8 && total_count > 8) { max_tries = 8; } - return max_tries; -} - -static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld) -{ - *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(); - if (max_tries <= 0) return NULL; - - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) - { - segment->abandoned_visits++; - // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? 
- // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries - // Perhaps we can skip non-suitable ones in a better way? - bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); - bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees) - if (segment->used == 0) { - // free the segment (by forced reclaim) to make it available to other threads. - // note1: we prefer to free a segment as that might lead to reclaiming another - // segment that is still partially used. - // note2: we could in principle optimize this by skipping reclaim and directly - // freeing but that would violate some invariants temporarily) - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else if (has_page && is_suitable) { - // found a large enough free span, or a page of the right block_size with free space - // we return the result of reclaim (which is usually `segment`) as it might free - // the segment due to concurrent frees (in which case `NULL` is returned). - return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); - } - else if (segment->abandoned_visits > 3 && is_suitable) { - // always reclaim on 3rd visit to limit the abandoned queue length. - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else { - // otherwise, push on the visited list so it gets not looked at too quickly again - mi_segment_try_purge(segment, false /* true force? */, tld->stats); // force purge if needed as we may not visit soon again - _mi_arena_segment_mark_abandoned(segment); - } - } - return NULL; -} - - -void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) -{ - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); - long max_tries = (force ? 
(long)_mi_arena_segment_abandoned_count() : 1024); // limit latency - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { - mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees) - if (segment->used == 0) { - // free the segment (by forced reclaim) to make it available to other threads. - // note: we could in principle optimize this by skipping reclaim and directly - // freeing but that would violate some invariants temporarily) - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else { - // otherwise, purge if needed and push on the visited list - // note: forced purge can be expensive if many threads are destroyed/created as in mstress. - mi_segment_try_purge(segment, force, tld->stats); - _mi_arena_segment_mark_abandoned(segment); - } - } -} - -/* ----------------------------------------------------------- - Reclaim or allocate ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); - - // 1. try to reclaim an abandoned segment - bool reclaimed; - mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld); - if (reclaimed) { - // reclaimed the right page right into the heap - mi_assert_internal(segment != NULL); - return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks - } - else if (segment != NULL) { - // reclaimed a segment with a large enough empty span in it - return segment; - } - // 2. 
otherwise allocate a fresh segment - return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL); -} - - -/* ----------------------------------------------------------- - Page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE); - - // find a free page - size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE)); - size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE; - mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size); - mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld); - if (page==NULL) { - // no free page, allocate a new segment and try again - if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) { - // OOM or reclaimed a good page in the heap - return NULL; - } - else { - // otherwise try again - return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld); - } - } - mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size); - mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id()); - mi_segment_try_purge(_mi_ptr_segment(page), false, tld->stats); - return page; -} - - - -/* ----------------------------------------------------------- - Huge page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_page_t* page = NULL; - mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page); - 
if (segment == NULL || page==NULL) return NULL; - mi_assert_internal(segment->used==1); - mi_assert_internal(mi_page_block_size(page) >= size); - #if MI_HUGE_PAGE_ABANDON - segment->thread_id = 0; // huge segments are immediately abandoned - #endif - - // for huge pages we initialize the block_size as we may - // overallocate to accommodate large alignments. - size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, &psize); - page->block_size = psize; - mi_assert_internal(page->is_huge); - - // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) - if (page_alignment > 0 && segment->allow_decommit) { - uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); - mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); - mi_assert_internal(psize - (aligned_p - start) >= size); - uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list - ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size, &_mi_stats_main); // note: cannot use segment_decommit on huge segments - } - - return page; -} - -#if MI_HUGE_PAGE_ABANDON -// free huge block from another thread -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - // huge page segments are always abandoned and can be freed immediately by any thread - mi_assert_internal(segment->kind==MI_SEGMENT_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0); - - // claim it and free - mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
- // paranoia: if this it the last reference, the cas should always succeed - size_t expected_tid = 0; - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) { - mi_block_set_next(page, block, page->free); - page->free = block; - page->used--; - page->is_zero_init = false; - mi_assert(page->used == 0); - mi_tld_t* tld = heap->tld; - _mi_segment_page_free(page, true, &tld->segments); - } -#if (MI_DEBUG!=0) - else { - mi_assert_internal(false); - } -#endif -} - -#else -// reset memory of a huge block from another thread -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - MI_UNUSED(page); - mi_assert_internal(segment->kind == MI_SEGMENT_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(page->used == 1); // this is called just before the free - mi_assert_internal(page->free == NULL); - if (segment->allow_decommit) { - size_t csize = mi_usable_size(block); - if (csize > sizeof(mi_block_t)) { - csize = csize - sizeof(mi_block_t); - uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); - _mi_os_reset(p, csize, &_mi_stats_main); // note: cannot use segment_decommit on huge segments - } - } -} -#endif - -/* ----------------------------------------------------------- - Page allocation and free ------------------------------------------------------------ */ -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_page_t* page; - if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { - mi_assert_internal(_mi_is_power_of_two(page_alignment)); - mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); - if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); - } - else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { - page = 
mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld); - } - else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld); - } - else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) { - page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld); - } - else { - page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld); - } - mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid)); - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - return page; -} - - diff --git a/system/lib/mimalloc/src/static.c b/system/lib/mimalloc/src/static.c index bf025eb794675..2383f65961357 100644 --- a/system/lib/mimalloc/src/static.c +++ b/system/lib/mimalloc/src/static.c @@ -20,10 +20,11 @@ terms of the MIT license. A copy of the license can be found in the file // containing the whole library. If it is linked first // it will override all the standard library allocation // functions (on Unix's). -#include "alloc.c" // includes alloc-override.c +#include "alloc.c" // includes alloc-override.c and free.c #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" +#include "arena-meta.c" #include "bitmap.c" #include "heap.c" #include "init.c" @@ -31,10 +32,11 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c -#include "random.c" -#include "segment.c" -#include "segment-map.c" +#include "page-map.c" +#include "random.c" #include "stats.c" +#include "theap.c" +#include "threadlocal.c" #include "prim/prim.c" #if MI_OSX_ZONE #include "prim/osx/alloc-override-zone.c" diff --git a/system/lib/mimalloc/src/stats.c b/system/lib/mimalloc/src/stats.c index a936402744d07..2bb5003f06a66 100644 --- a/system/lib/mimalloc/src/stats.c +++ b/system/lib/mimalloc/src/stats.c @@ -1,10 +1,11 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" +#include "mimalloc-stats.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" @@ -19,115 +20,121 @@ terms of the MIT license. 
A copy of the license can be found in the file Statistics operations ----------------------------------------------------------- */ -static bool mi_is_in_main(void* stat) { - return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); +static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + // add atomically + int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_maxi64_relaxed(&stat->peak, current + amount); + if (amount > 0) { + mi_atomic_addi64_relaxed(&stat->total, amount); + } } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; - if (mi_is_in_main(stat)) - { - // add atomically (for abandoned pages) - int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_maxi64_relaxed(&stat->peak, current + amount); - if (amount > 0) { - mi_atomic_addi64_relaxed(&stat->allocated,amount); - } - else { - mi_atomic_addi64_relaxed(&stat->freed, -amount); - } - } - else { - // add thread local - stat->current += amount; - if (stat->current > stat->peak) stat->peak = stat->current; - if (amount > 0) { - stat->allocated += amount; - } - else { - stat->freed += -amount; - } - } + // add thread local + stat->current += amount; + if (stat->current > stat->peak) { stat->peak = stat->current; } + if (amount > 0) { stat->total += amount; } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { - if (mi_is_in_main(stat)) { - mi_atomic_addi64_relaxed( &stat->count, 1 ); - mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); - } - else { - stat->count++; - stat->total += amount; - } + +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { + mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount); +} + +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { + stat->total += amount; } -void _mi_stat_increase(mi_stat_count_t* 
stat, size_t amount) { +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, (int64_t)amount); +} +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, (int64_t)amount); } -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, -((int64_t)amount)); +} +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } + +// Adjust stats to compensate; for example before committing a range, +// first adjust downwards with parts that were already committed so +// we avoid double counting. +static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + // adjust atomically + mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_addi64_relaxed(&stat->total, amount); +} + +static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + stat->current += amount; + stat->total += amount; +} + +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust_mt(stat, (int64_t)amount); +} +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust(stat, (int64_t)amount); +} +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust_mt(stat, -((int64_t)amount)); +} +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { + mi_stat_adjust(stat, -((int64_t)amount)); +} + + // must be thread safe as it is called from stats_merge -static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { +static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) { if (stat==src) return; - if (src->allocated==0 && src->freed==0) return; - mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit); - mi_atomic_addi64_relaxed( &stat->current, 
src->current * unit); - mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit); - // peak scores do not work across threads.. - mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit); + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); + const int64_t prev_current = mi_atomic_addi64_relaxed(&stat->current, src->current); + + // Global current plus thread peak approximates new global peak + // note: peak scores do really not work across threads. + // we used to just add them together but that often overestimates in practice. + // similarly, max does not seem to work well. The current approach + // by Artem Kharytoniuk (@artem-lunarg) seems to work better, see PR#1112 + // for a longer description. + mi_atomic_maxi64_relaxed(&stat->peak, prev_current + src->peak); } -static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) { +static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { if (stat==src) return; - mi_atomic_addi64_relaxed( &stat->total, src->total * unit); - mi_atomic_addi64_relaxed( &stat->count, src->count * unit); + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); } +#define MI_STAT_COUNT(stat) mi_stat_count_add_mt(&stats->stat, &src->stat); +#define MI_STAT_COUNTER(stat) mi_stat_counter_add_mt(&stats->stat, &src->stat); + // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { if (stats==src) return; - mi_stat_add(&stats->segments, &src->segments,1); - mi_stat_add(&stats->pages, &src->pages,1); - mi_stat_add(&stats->reserved, &src->reserved, 1); - mi_stat_add(&stats->committed, &src->committed, 1); - mi_stat_add(&stats->reset, &src->reset, 1); - mi_stat_add(&stats->purged, &src->purged, 1); - mi_stat_add(&stats->page_committed, &src->page_committed, 1); - - mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); - mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 
1); - mi_stat_add(&stats->threads, &src->threads, 1); - - mi_stat_add(&stats->malloc, &src->malloc, 1); - mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); - mi_stat_add(&stats->normal, &src->normal, 1); - mi_stat_add(&stats->huge, &src->huge, 1); - mi_stat_add(&stats->large, &src->large, 1); - - mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1); - mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1); - mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1); - mi_stat_counter_add(&stats->reset_calls, &src->reset_calls, 1); - mi_stat_counter_add(&stats->purge_calls, &src->purge_calls, 1); - - mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); - mi_stat_counter_add(&stats->searches, &src->searches, 1); - mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); - mi_stat_counter_add(&stats->large_count, &src->large_count, 1); -#if MI_STAT>1 + + // copy all fields + MI_STAT_FIELDS() + + #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { - mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1); - } + mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]); + } + #endif + for (size_t i = 0; i <= MI_BIN_HUGE; i++) { + mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); } -#endif } +#undef MI_STAT_COUNT +#undef MI_STAT_COUNTER + /* ----------------------------------------------------------- Display statistics ----------------------------------------------------------- */ @@ -136,7 +143,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { - char buf[32]; buf[0] = 0; + char buf[32]; _mi_memzero_var(buf); int len = 32; const char* suffix = (unit 
<= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 1000 : 1024); @@ -174,30 +181,30 @@ static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* ar } static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { - _mi_fprintf(out, arg,"%10s:", msg); + _mi_fprintf(out, arg," %-10s:", msg); if (unit != 0) { if (unit > 0) { mi_print_amount(stat->peak, unit, out, arg); - mi_print_amount(stat->allocated, unit, out, arg); - mi_print_amount(stat->freed, unit, out, arg); + mi_print_amount(stat->total, unit, out, arg); + // mi_print_amount(stat->freed, unit, out, arg); mi_print_amount(stat->current, unit, out, arg); mi_print_amount(unit, 1, out, arg); - mi_print_count(stat->allocated, unit, out, arg); + mi_print_count(stat->total, unit, out, arg); } else { mi_print_amount(stat->peak, -1, out, arg); - mi_print_amount(stat->allocated, -1, out, arg); - mi_print_amount(stat->freed, -1, out, arg); + mi_print_amount(stat->total, -1, out, arg); + // mi_print_amount(stat->freed, -1, out, arg); mi_print_amount(stat->current, -1, out, arg); if (unit == -1) { _mi_fprintf(out, arg, "%24s", ""); } else { mi_print_amount(-unit, 1, out, arg); - mi_print_count((stat->allocated / -unit), 0, out, arg); + mi_print_count((stat->total / -unit), 0, out, arg); } } - if (stat->allocated > stat->freed) { + if (stat->current != 0) { _mi_fprintf(out, arg, " "); _mi_fprintf(out, arg, (notok == NULL ? 
"not all freed" : notok)); _mi_fprintf(out, arg, "\n"); @@ -207,10 +214,9 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64 } } else { - mi_print_amount(stat->peak, 1, out, arg); - mi_print_amount(stat->allocated, 1, out, arg); - _mi_fprintf(out, arg, "%11s", " "); // no freed - mi_print_amount(stat->current, 1, out, arg); + mi_print_amount(stat->peak, 0, out, arg); + mi_print_amount(stat->total, 0, out, arg); + mi_print_amount(stat->current, 0, out, arg); _mi_fprintf(out, arg, "\n"); } } @@ -219,47 +225,59 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t mi_stat_print_ex(stat, msg, unit, out, arg, NULL); } -static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { - _mi_fprintf(out, arg, "%10s:", msg); - mi_print_amount(stat->peak, unit, out, arg); +#if MI_STAT>1 +static void mi_stat_total_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { + _mi_fprintf(out, arg, " %-10s:", msg); + _mi_fprintf(out, arg, "%12s", " "); // no peak + mi_print_amount(stat->total, unit, out, arg); _mi_fprintf(out, arg, "\n"); } +#endif static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { - _mi_fprintf(out, arg, "%10s:", msg); - mi_print_amount(stat->total, -1, out, arg); + _mi_fprintf(out, arg, " %-10s:", msg); + mi_print_amount(stat->total, 0, out, arg); _mi_fprintf(out, arg, "\n"); } +static void mi_stat_counter_print_size(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { + _mi_fprintf(out, arg, " %-10s:", msg); + mi_print_amount(stat->total, 1, out, arg); + _mi_fprintf(out, arg, "\n"); +} -static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) { - const int64_t avg_tens = (stat->count == 0 ? 
0 : (stat->total*10 / stat->count)); - const long avg_whole = (long)(avg_tens/10); - const long avg_frac1 = (long)(avg_tens%10); - _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); +static void mi_stat_average_print(int64_t count, int64_t total, const char* msg, mi_output_fun* out, void* arg) { + const int64_t avg_tens = (count == 0 ? 0 : (total*10 / count)); + const int64_t avg_whole = avg_tens/10; + const int64_t avg_frac1 = avg_tens%10; + _mi_fprintf(out, arg, " %-10s: %5lld.%lld avg\n", msg, avg_whole, avg_frac1); } -static void mi_print_header(mi_output_fun* out, void* arg ) { - _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count "); +static void mi_print_header(const char* name,mi_output_fun* out, void* arg ) { + _mi_fprintf(out, arg, " %-11s %11s %11s %11s %11s %11s\n", + name, "peak ", "total ", "current ", "block ", "total# "); } #if MI_STAT>1 -static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) { +static bool mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, mi_output_fun* out, void* arg) { bool found = false; char buf[64]; for (size_t i = 0; i <= max; i++) { - if (bins[i].allocated > 0) { + if (bins[i].total > 0) { found = true; - int64_t unit = _mi_bin_size((uint8_t)i); - _mi_snprintf(buf, 64, "%s %3lu", fmt, (long)i); - mi_stat_print(&bins[i], buf, unit, out, arg); + const size_t unit = _mi_bin_size((uint8_t)i); + const char* pagekind = (unit <= MI_SMALL_MAX_OBJ_SIZE ? "S" : + (unit <= MI_MEDIUM_MAX_OBJ_SIZE ? "M" : + (unit <= MI_LARGE_MAX_OBJ_SIZE ? 
"L" : "H"))); + _mi_snprintf(buf, 64, "bin%2s %3lu", pagekind, (long)i); + mi_stat_print(&bins[i], buf, (int64_t)unit, out, arg); } } if (found) { _mi_fprintf(out, arg, "\n"); - mi_print_header(out, arg); } + return found; } #endif @@ -299,56 +317,8 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { // Print statistics //------------------------------------------------------------ -static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { - // wrap the output function to be line buffered - char buf[256]; - buffered_t buffer = { out0, arg0, NULL, 0, 255 }; - buffer.buf = buf; - mi_output_fun* out = &mi_buffered_out; - void* arg = &buffer; - - // and print using that - mi_print_header(out,arg); - #if MI_STAT>1 - mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg); - #endif - #if MI_STAT - mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 
1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); - mi_stat_count_t total = { 0,0,0,0 }; - mi_stat_add(&total, &stats->normal, 1); - mi_stat_add(&total, &stats->large, 1); - mi_stat_add(&total, &stats->huge, 1); - mi_stat_print(&total, "total", 1, out, arg); - #endif - #if MI_STAT>1 - mi_stat_print(&stats->malloc, "malloc req", 1, out, arg); - _mi_fprintf(out, arg, "\n"); - #endif - mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); - mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); - mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); - mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); - mi_stat_print(&stats->page_committed, "touched", 1, out, arg); - mi_stat_print(&stats->segments, "segments", -1, out, arg); - mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); - mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); - mi_stat_print(&stats->pages, "pages", -1, out, arg); - mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); - mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); - mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); - mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); - mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); - mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); - mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); - mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); - mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); - mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); - mi_stat_print(&stats->threads, "threads", -1, out, arg); - mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); - _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count()); - +mi_decl_export void mi_process_info_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept +{ size_t 
elapsed; size_t user_time; size_t sys_time; @@ -358,58 +328,188 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) size_t peak_commit; size_t page_faults; mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); - _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); - _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process", - user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults ); + _mi_fprintf(out, arg, " %-10s: %5zu.%03zu s\n", "elapsed", elapsed/1000, elapsed%1000); + _mi_fprintf(out, arg, " %-10s: user: %zu.%03zu s, system: %zu.%03zu s, faults: %zu, peak rss: ", "process", + user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, page_faults); mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s"); if (peak_commit > 0) { - _mi_fprintf(out, arg, ", commit: "); + _mi_fprintf(out, arg, ", peak commit: "); mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s"); } _mi_fprintf(out, arg, "\n"); } +void _mi_stats_print(const char* name, size_t id, mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { + // wrap the output function to be line buffered + char buf[256]; _mi_memzero_var(buf); + buffered_t buffer = { out0, arg0, NULL, 0, 255 }; + buffer.buf = buf; + mi_output_fun* out = &mi_buffered_out; + void* arg = &buffer; + + // and print using that + _mi_fprintf(out, arg, "%s %zu\n", name, id); + + if (stats->malloc_normal.total + stats->malloc_huge.total != 0) { + #if MI_STAT>1 + mi_print_header("blocks", out, arg); + mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, out, arg); + #endif + #if MI_STAT + mi_stat_print(&stats->malloc_normal, "binned", (stats->malloc_normal_count.total == 0 ? -1 : 1), out, arg); + mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 
-1 : 1), out, arg); + mi_stat_count_t total = { 0,0,0 }; + mi_stat_count_add_mt(&total, &stats->malloc_normal); + mi_stat_count_add_mt(&total, &stats->malloc_huge); + mi_stat_print_ex(&total, "total", 1, out, arg, ""); + #if MI_STAT>1 + mi_stat_total_print(&stats->malloc_requested, "malloc req", 1, out, arg); + #endif + _mi_fprintf(out, arg, "\n"); + #endif + } + + if (stats->pages.total != 0) { + mi_print_header("pages", out, arg); + mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); + // mi_stat_print(&stats->segments, "segments", -1, out, arg); + // mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); + // mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); + mi_stat_print(&stats->pages, "pages", 0, out, arg); + mi_stat_print(&stats->pages_abandoned, "abandoned", 0, out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "reclaima", out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_free, "reclaimf", out, arg); + mi_stat_counter_print(&stats->pages_reabandon_full, "reabandon", out, arg); + mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "waits", out, arg); + mi_stat_counter_print(&stats->pages_extended, "extended", out, arg); + mi_stat_counter_print(&stats->pages_retire, "retire", out, arg); + mi_stat_average_print(stats->page_searches_count.total, stats->page_searches.total, "searches", out, arg); + _mi_fprintf(out, arg, "\n"); + } + + if (stats->arena_count.total > 0) { + mi_print_header("arenas", out, arg); + mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); + mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); + mi_stat_counter_print_size(&stats->reset, "reset", out, arg); + mi_stat_counter_print_size(&stats->purged, "purged", out, arg); + + mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); + mi_stat_counter_print(&stats->arena_rollback_count, "rollback", out, arg); + mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); + 
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); + mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); + mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); + mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg); + mi_stat_print_ex(&stats->theaps, "theaps", 0, out, arg, ""); + mi_stat_print_ex(&stats->heaps, "heaps", 0, out, arg, ""); + mi_stat_counter_print(&stats->heaps_delete_wait, "heap waits", out, arg); + _mi_fprintf(out, arg, "\n"); + + mi_print_header("process", out, arg); + mi_stat_print_ex(&stats->threads, "threads", 0, out, arg, ""); + _mi_fprintf(out, arg, " %-10s: %5i\n", "numa nodes", _mi_os_numa_node_count()); + mi_process_info_print_out(out, arg); + } + _mi_fprintf(out, arg, "\n"); +} + + static mi_msecs_t mi_process_start; // = 0 -static mi_stats_t* mi_stats_get_default(void) { - mi_heap_t* heap = mi_heap_get_default(); - return &heap->tld->stats; +// called on process init +void _mi_stats_init(void) { + if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } -static void mi_stats_merge_from(mi_stats_t* stats) { - if (stats != &_mi_stats_main) { - mi_stats_add(&_mi_stats_main, stats); - memset(stats, 0, sizeof(mi_stats_t)); - } +static void mi_stats_add_into(mi_stats_t* to, mi_stats_t* from) { + mi_assert_internal(to != NULL && from != NULL); + if (to == from) return; + mi_stats_add(to, from); +} + +void _mi_stats_merge_into(mi_stats_t* to, mi_stats_t* from) { + mi_assert_internal(to != NULL && from != NULL); + if (to == from) return; + mi_stats_add(to, from); + _mi_memzero(from, sizeof(mi_stats_t)); +} + +static mi_stats_t* mi_stats_merge_theap_to_heap(mi_theap_t* theap) mi_attr_noexcept { + mi_stats_t* stats = &theap->stats; + mi_stats_t* heap_stats = &_mi_theap_heap(theap)->stats; + _mi_stats_merge_into( heap_stats, stats ); + return heap_stats; +} + +static mi_stats_t* mi_heap_get_stats(mi_heap_t* heap) { + if (heap==NULL) { heap = mi_heap_main(); } + mi_theap_t* theap 
= _mi_heap_theap_peek(heap); + if (theap==NULL) return &heap->stats; + else return mi_stats_merge_theap_to_heap(theap); } +// deprecated void mi_stats_reset(void) mi_attr_noexcept { - mi_stats_t* stats = mi_stats_get_default(); - if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } - memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); - if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; + if (!mi_theap_is_initialized(_mi_theap_default())) return; + mi_heap_get_stats(mi_heap_main()); + mi_heap_stats_merge_to_subproc(mi_heap_main()); } -void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_stats_get_default() ); + +void mi_heap_stats_print_out(mi_heap_t* heap, mi_output_fun* out, void* arg) mi_attr_noexcept { + if (heap==NULL) { heap = mi_heap_main(); } + _mi_stats_print("heap", heap->heap_seq, mi_heap_get_stats(heap), out, arg); +} + +typedef struct mi_heap_print_visit_info_s { + mi_output_fun* out; + void* out_arg; +} mi_heap_print_visit_info_t; + +static bool mi_cdecl mi_heap_print_visitor(mi_heap_t* heap, void* arg) { + mi_heap_print_visit_info_t* vinfo = (mi_heap_print_visit_info_t*)(arg); + mi_heap_stats_print_out(heap, vinfo->out, vinfo->out_arg); + return true; +} + + +// show each heap and then the subproc +void mi_subproc_heap_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + if (subproc==NULL) return; + mi_heap_print_visit_info_t vinfo = { out, arg }; + mi_subproc_visit_heaps(subproc, &mi_heap_print_visitor, &vinfo); + _mi_stats_print("subproc", subproc->subproc_seq, &subproc->stats, out, arg); } -void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` - mi_stats_merge_from(stats); + +// aggregate all stats from the heaps and subproc and print those +void mi_subproc_stats_print_out(mi_subproc_id_t subproc_id, mi_output_fun* out, void* arg) mi_attr_noexcept { + mi_subproc_t* subproc = 
_mi_subproc_from_id(subproc_id); + if (subproc==NULL) return; + mi_stats_t_decl(stats); + if (mi_subproc_stats_get(subproc_id, &stats)) { + _mi_stats_print("subproc", subproc->subproc_seq, &stats, out, arg); + } } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_stats_get_default()); - _mi_stats_print(&_mi_stats_main, out, arg); + mi_subproc_stats_print_out(mi_subproc_current(),out, arg); } +// deprecated void mi_stats_print(void* out) mi_attr_noexcept { // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`) mi_stats_print_out((mi_output_fun*)out, NULL); } +// deprecated void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - _mi_stats_print(mi_stats_get_default(), out, arg); + mi_theap_t* theap = _mi_theap_default(); + if (theap==NULL || !mi_theap_is_initialized(theap)) return; + _mi_stats_print("heap", _mi_theap_heap(theap)->heap_seq, &theap->stats, out, arg); + mi_stats_merge_theap_to_heap(_mi_theap_default()); } @@ -443,11 +543,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc_main(); mi_process_info_t pinfo; _mi_memzero_var(pinfo); pinfo.elapsed = _mi_clock_end(mi_process_start); - pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current))); + pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak))); pinfo.current_rss = pinfo.current_commit; pinfo.peak_rss = 
pinfo.peak_commit; pinfo.utime = 0; @@ -465,3 +566,243 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s if (peak_commit!=NULL) *peak_commit = pinfo.peak_commit; if (page_faults!=NULL) *page_faults = pinfo.page_faults; } + +mi_decl_export void mi_process_info_print(void) mi_attr_noexcept { + mi_process_info_print_out(NULL, NULL); +} + + +// -------------------------------------------------------- +// Return statistics +// -------------------------------------------------------- + +size_t mi_stats_get_bin_size(size_t bin) mi_attr_noexcept { + if (bin > MI_BIN_HUGE) return 0; + return _mi_bin_size(bin); +} + +static bool _mi_stats_get(mi_stats_t* stats_in, mi_stats_t* stats_out) mi_attr_noexcept { + if (stats_out == NULL || stats_out->size != sizeof(mi_stats_t) || stats_out->version != MI_STAT_VERSION) return false; + if (stats_in == NULL || stats_in->size != stats_out->size) return false; + _mi_memcpy(stats_out, stats_in, stats_out->size); + return true; +} + +bool mi_subproc_stats_get_exclusive(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept { + return _mi_stats_get(&_mi_subproc_from_id(subproc_id)->stats, stats); +} + +bool mi_heap_stats_get(mi_heap_t* heap, mi_stats_t* stats) mi_attr_noexcept { + return _mi_stats_get(mi_heap_get_stats(heap), stats); +} + + +static bool mi_cdecl mi_heap_aggregate_visitor(mi_heap_t* heap, void* arg) { + mi_stats_t* stats = (mi_stats_t*)arg; + mi_stats_add_into(stats, &heap->stats); + return true; +} + +bool mi_subproc_stats_get(mi_subproc_id_t subproc_id, mi_stats_t* stats) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + if (stats == NULL || stats->size != sizeof(mi_stats_t) || stats->version != MI_STAT_VERSION) return false; + _mi_memzero(stats,stats->size); + mi_subproc_visit_heaps(subproc, &mi_heap_aggregate_visitor, stats); + mi_stats_add_into(stats, &subproc->stats); + return true; +} + +bool mi_stats_get(mi_stats_t* stats) mi_attr_noexcept { + 
return mi_subproc_stats_get(mi_subproc_current(), stats); +} + + +// -------------------------------------------------------- +// Statics in json format +// -------------------------------------------------------- + +typedef struct mi_json_buf_s { + char* buf; + size_t size; + size_t used; + bool can_realloc; +} mi_json_buf_t; + +static bool mi_json_buf_expand(mi_json_buf_t* hbuf) { + if (hbuf==NULL) return false; + if (hbuf->buf != NULL && hbuf->size>0) { + hbuf->buf[hbuf->size-1] = 0; + } + if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false; + const size_t newsize = (hbuf->size == 0 ? mi_good_size(12*MI_KiB) : 2*hbuf->size); + char* const newbuf = (char*)mi_rezalloc(hbuf->buf, newsize); + if (newbuf == NULL) return false; + hbuf->buf = newbuf; + hbuf->size = newsize; + return true; +} + +static void mi_json_buf_print(mi_json_buf_t* hbuf, const char* msg) { + if (msg==NULL || hbuf==NULL) return; + if (hbuf->used + 1 >= hbuf->size && !hbuf->can_realloc) return; + for (const char* src = msg; *src != 0; src++) { + char c = *src; + if (hbuf->used + 1 >= hbuf->size) { + if (!mi_json_buf_expand(hbuf)) return; + } + mi_assert_internal(hbuf->used < hbuf->size); + hbuf->buf[hbuf->used++] = c; + } + mi_assert_internal(hbuf->used < hbuf->size); + hbuf->buf[hbuf->used] = 0; +} + +static void mi_json_buf_print_count_bin(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, size_t bin, bool add_comma) { + const size_t binsize = mi_stats_get_bin_size(bin); + const size_t pagesize = (binsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_SMALL_PAGE_SIZE : + (binsize <= MI_MEDIUM_MAX_OBJ_SIZE ? MI_MEDIUM_PAGE_SIZE : + (binsize <= MI_LARGE_MAX_OBJ_SIZE ? MI_LARGE_PAGE_SIZE : 0))); + char buf[128]; + _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, stat->total, stat->peak, stat->current, binsize, pagesize, (add_comma ? 
"," : "")); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); +} + +static void mi_json_buf_print_count_cbin(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, mi_chunkbin_t bin, bool add_comma) { + const char* cbin = " "; + switch(bin) { + case MI_CBIN_SMALL: cbin = "S"; break; + case MI_CBIN_MEDIUM: cbin = "M"; break; + case MI_CBIN_LARGE: cbin = "L"; break; + case MI_CBIN_HUGE: cbin = "H"; break; + case MI_CBIN_OTHER: cbin = "X"; break; + default: cbin = " "; break; + } + char buf[128]; + _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"bin\": \"%s\" }%s\n", prefix, stat->total, stat->peak, stat->current, cbin, (add_comma ? "," : "")); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); +} + +static void mi_json_buf_print_count(mi_json_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, bool add_comma) { + char buf[128]; + _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld }%s\n", prefix, stat->total, stat->peak, stat->current, (add_comma ? "," : "")); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); +} + +static void mi_json_buf_print_count_value(mi_json_buf_t* hbuf, const char* name, mi_stat_count_t* stat) { + char buf[128]; + _mi_snprintf(buf, 128, " \"%s\": ", name); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); + mi_json_buf_print_count(hbuf, "", stat, true); +} + +static void mi_json_buf_print_value(mi_json_buf_t* hbuf, const char* name, int64_t val) { + char buf[128]; + _mi_snprintf(buf, 128, " \"%s\": %lld,\n", name, val); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); +} + +static void mi_json_buf_print_size(mi_json_buf_t* hbuf, const char* name, size_t val, bool add_comma) { + char buf[128]; + _mi_snprintf(buf, 128, " \"%s\": %zu%s\n", name, val, (add_comma ? 
"," : "")); + buf[127] = 0; + mi_json_buf_print(hbuf, buf); +} + +static void mi_json_buf_print_counter_value(mi_json_buf_t* hbuf, const char* name, mi_stat_counter_t* stat) { + mi_json_buf_print_value(hbuf, name, stat->total); +} + +#define MI_STAT_COUNT(stat) mi_json_buf_print_count_value(&hbuf, #stat, &stats->stat); +#define MI_STAT_COUNTER(stat) mi_json_buf_print_counter_value(&hbuf, #stat, &stats->stat); + +static char* mi_stats_get_json_from(mi_stats_t* stats, size_t output_size, char* output_buf) mi_attr_noexcept { + if (stats==NULL || stats->size!=sizeof(mi_stats_t) || stats->version!=MI_STAT_VERSION) return NULL; + mi_json_buf_t hbuf = { NULL, 0, 0, true }; + if (output_size > 0 && output_buf != NULL) { + _mi_memzero(output_buf, output_size); + hbuf.buf = output_buf; + hbuf.size = output_size; + hbuf.can_realloc = false; + } + else { + if (!mi_json_buf_expand(&hbuf)) return NULL; + } + mi_json_buf_print(&hbuf, "{\n"); + mi_json_buf_print_value(&hbuf, "stat_version", MI_STAT_VERSION); + mi_json_buf_print_value(&hbuf, "mimalloc_version", MI_MALLOC_VERSION); + + // process info + mi_json_buf_print(&hbuf, " \"process\": {\n"); + size_t elapsed; + size_t user_time; + size_t sys_time; + size_t current_rss; + size_t peak_rss; + size_t current_commit; + size_t peak_commit; + size_t page_faults; + mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); + mi_json_buf_print_size(&hbuf, "elapsed_msecs", elapsed, true); + mi_json_buf_print_size(&hbuf, "user_msecs", user_time, true); + mi_json_buf_print_size(&hbuf, "system_msecs", sys_time, true); + mi_json_buf_print_size(&hbuf, "page_faults", page_faults, true); + mi_json_buf_print_size(&hbuf, "rss_current", current_rss, true); + mi_json_buf_print_size(&hbuf, "rss_peak", peak_rss, true); + mi_json_buf_print_size(&hbuf, "commit_current", current_commit, true); + mi_json_buf_print_size(&hbuf, "commit_peak", peak_commit, false); + mi_json_buf_print(&hbuf, " },\n"); + + 
// statistics + MI_STAT_FIELDS() + + // size bins + mi_json_buf_print(&hbuf, " \"malloc_bins\": [\n"); + for (size_t i = 0; i <= MI_BIN_HUGE; i++) { + mi_json_buf_print_count_bin(&hbuf, " ", &stats->malloc_bins[i], i, i!=MI_BIN_HUGE); + } + mi_json_buf_print(&hbuf, " ],\n"); + mi_json_buf_print(&hbuf, " \"page_bins\": [\n"); + for (size_t i = 0; i <= MI_BIN_HUGE; i++) { + mi_json_buf_print_count_bin(&hbuf, " ", &stats->page_bins[i], i, i!=MI_BIN_HUGE); + } + mi_json_buf_print(&hbuf, " ],\n"); + mi_json_buf_print(&hbuf, " \"chunk_bins\": [\n"); + for (size_t i = 0; i < MI_CBIN_COUNT; i++) { + mi_json_buf_print_count_cbin(&hbuf, " ", &stats->chunk_bins[i], (mi_chunkbin_t)i, i!=MI_CBIN_COUNT-1); + } + mi_json_buf_print(&hbuf, " ]\n"); + mi_json_buf_print(&hbuf, "}\n"); + if (hbuf.used >= hbuf.size) { + // failed + if (hbuf.can_realloc) { mi_free(hbuf.buf); } + return NULL; + } + else { + return hbuf.buf; + } +} + +char* mi_subproc_stats_get_json(mi_subproc_id_t subproc_id, size_t buf_size, char* buf) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + if (subproc==NULL) return NULL; + mi_stats_t_decl(stats); + if (!mi_subproc_stats_get(subproc_id,&stats)) return NULL; + return mi_stats_get_json_from(&subproc->stats, buf_size, buf); +} + +char* mi_heap_stats_get_json(mi_heap_t* heap, size_t buf_size, char* buf) mi_attr_noexcept { + return mi_stats_get_json_from(mi_heap_get_stats(heap), buf_size, buf); +} + +char* mi_stats_get_json(size_t buf_size, char* buf) mi_attr_noexcept { + return mi_subproc_stats_get_json(mi_subproc_current(), buf_size, buf); +} + +char* mi_stats_as_json(mi_stats_t* stats, size_t buf_size, char* buf) mi_attr_noexcept { + return mi_stats_get_json_from(stats, buf_size, buf); +} diff --git a/system/lib/mimalloc/src/theap.c b/system/lib/mimalloc/src/theap.c new file mode 100644 index 0000000000000..4b419f1004b5a --- /dev/null +++ b/system/lib/mimalloc/src/theap.c @@ -0,0 +1,714 @@ 
+/*---------------------------------------------------------------------------- +Copyright (c) 2018-2025, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" // _mi_theap_default + +#if defined(_MSC_VER) && (_MSC_VER < 1920) +#pragma warning(disable:4204) // non-constant aggregate initializer +#endif + +/* ----------------------------------------------------------- + Helpers +----------------------------------------------------------- */ + +// return `true` if ok, `false` to break +typedef bool (theap_page_visitor_fun)(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2); + +// Visit all pages in a theap; returns `false` if break was called. +static bool mi_theap_visit_pages(mi_theap_t* theap, theap_page_visitor_fun* fn, bool include_full, void* arg1, void* arg2) +{ + if (theap==NULL || theap->page_count==0) return 0; + + // visit all pages + #if MI_DEBUG>1 + size_t total = theap->page_count; + size_t count = 0; + #endif + + const size_t max_bin = (include_full ? 
MI_BIN_FULL : MI_BIN_FULL - 1); + for (size_t i = 0; i <= max_bin; i++) { + mi_page_queue_t* pq = &theap->pages[i]; + mi_page_t* page = pq->first; + while(page != NULL) { + mi_page_t* next = page->next; // save next in case the page gets removed from the queue + mi_assert_internal(mi_page_theap(page) == theap); + #if MI_DEBUG>1 + count++; + #endif + if (!fn(theap, pq, page, arg1, arg2)) return false; + page = next; // and continue + } + } + mi_assert_internal(!include_full || count == total); + return true; +} + + +#if MI_DEBUG>=2 +static bool mi_theap_page_is_valid(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { + MI_UNUSED(arg1); + MI_UNUSED(arg2); + MI_UNUSED(pq); + mi_assert_internal(mi_page_theap(page) == theap); + mi_assert_expensive(_mi_page_is_valid(page)); + return true; +} +#endif +#if MI_DEBUG>=3 +static bool mi_theap_is_valid(mi_theap_t* theap) { + mi_assert_internal(theap!=NULL); + mi_theap_visit_pages(theap, &mi_theap_page_is_valid, true, NULL, NULL); + for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) { + mi_assert_internal(_mi_page_queue_is_valid(theap, &theap->pages[bin])); + } + return true; +} +#endif + + + + +/* ----------------------------------------------------------- + "Collect" pages by migrating `local_free` and `thread_free` + lists and freeing empty pages. 
This is done when a thread + stops (and in that case abandons pages if there are still + blocks alive) +----------------------------------------------------------- */ + +typedef enum mi_collect_e { + MI_NORMAL, + MI_FORCE, + MI_ABANDON +} mi_collect_t; + + +static bool mi_theap_page_collect(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) { + MI_UNUSED(arg2); + MI_UNUSED(theap); + mi_assert_internal(mi_theap_page_is_valid(theap, pq, page, NULL, NULL)); + mi_collect_t collect = *((mi_collect_t*)arg_collect); + _mi_page_free_collect(page, collect >= MI_FORCE); + if (mi_page_all_free(page)) { + // no more used blocks, possibly free the page. + if (collect >= MI_FORCE || page->retire_expire == 0) { // either forced/abandon, or not already retired + // note: this will potentially free retired pages as well. + _mi_page_free(page, pq); + } + } + else if (collect == MI_ABANDON) { + // still used blocks but the thread is done; abandon the page + _mi_page_abandon(page, pq); + } + return true; // don't break +} + +static void mi_theap_merge_stats(mi_theap_t* theap) { + mi_assert_internal(mi_theap_is_initialized(theap)); + _mi_stats_merge_into(&_mi_theap_heap(theap)->stats, &theap->stats); +} + +static void mi_theap_collect_ex(mi_theap_t* theap, mi_collect_t collect) +{ + if (theap==NULL || !mi_theap_is_initialized(theap)) return; + mi_assert_expensive(mi_theap_is_valid(theap)); + + const bool force = (collect >= MI_FORCE); + _mi_deferred_free(theap, force); + + // python/cpython#112532: we may be called from a thread that is not the owner of the theap + // const bool is_main_thread = (_mi_is_main_thread() && theap->thread_id == _mi_thread_id()); + + // collect retired pages + _mi_theap_collect_retired(theap, force); + + // collect all pages owned by this thread + mi_theap_visit_pages(theap, &mi_theap_page_collect, (collect!=MI_NORMAL), &collect, NULL); // dont normally visit full pages, see issue #1220 + + // collect arenas (this is 
program wide so don't force purges on abandonment of threads) + //mi_atomic_storei64_release(&theap->tld->subproc->purge_expire, 1); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? */, theap->tld); + + // merge statistics + mi_theap_merge_stats(theap); +} + +void _mi_theap_collect_abandon(mi_theap_t* theap) { + mi_theap_collect_ex(theap, MI_ABANDON); +} + +void mi_theap_collect(mi_theap_t* theap, bool force) mi_attr_noexcept { + mi_theap_collect_ex(theap, (force ? MI_FORCE : MI_NORMAL)); +} + +void mi_collect(bool force) mi_attr_noexcept { + // cannot really collect process wide, just a theap.. + mi_theap_collect(_mi_theap_default(), force); +} + +void mi_heap_collect(mi_heap_t* heap, bool force) { + // cannot really collect a heap, just a theap.. + mi_theap_collect(mi_heap_theap(heap), force); +} + +/* ----------------------------------------------------------- + Heap new +----------------------------------------------------------- */ + +mi_theap_t* mi_theap_get_default(void) { + mi_theap_t* theap = _mi_theap_default(); + if mi_unlikely(!mi_theap_is_initialized(theap)) { + mi_thread_init(); + theap = _mi_theap_default(); + mi_assert_internal(mi_theap_is_initialized(theap)); + } + return theap; +} + +// todo: make order of parameters consistent (but would that break compat with CPython?) +void _mi_theap_init(mi_theap_t* theap, mi_heap_t* heap, mi_tld_t* tld) +{ + mi_assert_internal(theap!=NULL); + mi_assert_internal(heap!=NULL); + mi_memid_t memid = theap->memid; + _mi_memcpy_aligned(theap, &_mi_theap_empty, sizeof(mi_theap_t)); + theap->memid = memid; + theap->refcount = 1; + theap->tld = tld; // avoid reading the thread-local tld during initialization + mi_atomic_store_ptr_relaxed(mi_heap_t,&theap->heap,heap); + + _mi_theap_options_init(theap); + if (theap->tld->is_in_threadpool) { + // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our theap. 
+ // this is checked in `free.c:mi_free_try_collect_mt` + // .. but abandoning is good in this case: halve the full page retain (possibly to 0) + // (so blocked threads do not hold on to too much memory) + if (theap->page_full_retain > 0) { + theap->page_full_retain = theap->page_full_retain / 4; + } + } + + // push on the thread local theaps list + mi_theap_t* head = NULL; + mi_lock(&theap->tld->theaps_lock) { + head = theap->tld->theaps; + theap->tprev = NULL; + theap->tnext = head; + if (head!=NULL) { head->tprev = theap; } + theap->tld->theaps = theap; + } + + // initialize random + if (head == NULL) { // first theap in this thread? + #if defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&theap->random); // prevent allocation failure during bcrypt dll initialization with static linking (issue #1185) + #else + _mi_random_init(&theap->random); + #endif + } + else { + _mi_random_split(&head->random, &theap->random); + } + theap->cookie = _mi_theap_random_next(theap) | 1; + _mi_theap_guarded_init(theap); + mi_subproc_stat_increase(_mi_subproc(),theaps,1); + + // push on the heap's theap list + mi_lock(&heap->theaps_lock) { + head = heap->theaps; + theap->hprev = NULL; + theap->hnext = head; + if (head!=NULL) { head->hprev = theap; } + heap->theaps = theap; + } +} + +mi_theap_t* _mi_theap_create(mi_heap_t* heap, mi_tld_t* tld) { + mi_assert_internal(tld!=NULL); + mi_assert_internal(heap!=NULL); + // allocate and initialize a theap + mi_memid_t memid; + mi_theap_t* theap; + //if (!_mi_is_heap_main(heap)) { + // theap = (mi_theap_t*)mi_heap_zalloc(mi_heap_main(),sizeof(mi_theap_t)); + // memid = _mi_memid_create(MI_MEM_HEAP_MAIN); + // memid.initially_zero = memid.initially_committed = true; + //} + //else + if (heap->exclusive_arena == NULL) { + theap = (mi_theap_t*)_mi_meta_zalloc(sizeof(mi_theap_t), &memid); + } + else { + // theaps associated with a specific arena are allocated in that arena + // note: takes up at least one slice which is quite 
wasteful... + const size_t size = _mi_align_up(sizeof(mi_theap_t),MI_ARENA_MIN_OBJ_SIZE); + theap = (mi_theap_t*)_mi_arenas_alloc(heap, size, true, true, heap->exclusive_arena, tld->thread_seq, tld->numa_node, &memid); + mi_assert_internal(memid.mem.os.size >= size); + } + if (theap==NULL) { + _mi_error_message(ENOMEM, "unable to allocate theap meta-data\n"); + return NULL; + } + theap->memid = memid; + _mi_theap_init(theap, heap, tld); + return theap; +} + +uintptr_t _mi_theap_random_next(mi_theap_t* theap) { + return _mi_random_next(&theap->random); +} + +static void mi_theap_free_mem(mi_theap_t* theap) { + if (theap!=NULL) { + mi_subproc_stat_decrease(_mi_subproc(),theaps,1); + // free the used memory + if (theap->memid.memkind == MI_MEM_HEAP_MAIN) { // note: for now unused as it would access theap_default stats in mi_free of the current theap + mi_assert_internal(_mi_is_heap_main(mi_heap_of(theap))); + mi_free(theap); + } + else if (theap->memid.memkind == MI_MEM_META) { + _mi_meta_free(theap, sizeof(*theap), theap->memid); + } + else { + _mi_arenas_free(theap, _mi_align_up(sizeof(*theap),MI_ARENA_MIN_OBJ_SIZE), theap->memid ); // issue #1168, avoid assertion failure + } + } +} + +void _mi_theap_incref(mi_theap_t* theap) { + if (theap!=NULL && theap->memid.memkind > MI_MEM_STATIC) { + mi_atomic_increment_acq_rel(&theap->refcount); + } +} + +void _mi_theap_decref(mi_theap_t* theap) { + if (theap!=NULL && theap->memid.memkind > MI_MEM_STATIC) { + if (mi_atomic_decrement_acq_rel(&theap->refcount) == 1) { + mi_theap_free_mem(theap); + } + } +} + + +// called from `mi_theap_delete` to free the internal theap resources. 
+bool _mi_theap_free(mi_theap_t* theap, bool acquire_heap_theaps_lock, bool acquire_tld_theaps_lock) { + mi_assert(theap != NULL); + if (theap==NULL) return true; + + mi_heap_t* const heap = mi_atomic_exchange_ptr_acq_rel(mi_heap_t, &theap->heap, NULL); + if (heap==NULL) { + // concurrent interaction, retry in an outer loop (as the other thread may be blocked on our lock) + return false; + } + else { + // merge stats to the owning heap + _mi_stats_merge_into(&heap->stats, &theap->stats); + + // remove ourselves from the heap theaps list + mi_lock_maybe(&heap->theaps_lock, acquire_heap_theaps_lock) { + if (theap->hnext != NULL) { theap->hnext->hprev = theap->hprev; } + if (theap->hprev != NULL) { theap->hprev->hnext = theap->hnext; } + else { mi_assert_internal(heap->theaps == theap); heap->theaps = theap->hnext; } + theap->hnext = theap->hprev = NULL; + } + + // remove ourselves from the thread local theaps list + mi_lock_maybe(&theap->tld->theaps_lock, acquire_tld_theaps_lock) { + if (theap->tnext != NULL) { theap->tnext->tprev = theap->tprev; } + if (theap->tprev != NULL) { theap->tprev->tnext = theap->tnext; } + else { mi_assert_internal(theap->tld->theaps == theap); theap->tld->theaps = theap->tnext; } + theap->tnext = theap->tprev = NULL; + } + theap->tld = NULL; + _mi_theap_decref(theap); + return true; + } +} + + +/* ----------------------------------------------------------- + Heap destroy +----------------------------------------------------------- */ +/* + +// zero out the page queues +static void mi_theap_reset_pages(mi_theap_t* theap) { + mi_assert_internal(theap != NULL); + mi_assert_internal(mi_theap_is_initialized(theap)); + // TODO: copy full empty theap instead? 
+ _mi_memset(&theap->pages_free_direct, 0, sizeof(theap->pages_free_direct)); + _mi_memcpy_aligned(&theap->pages, &_mi_theap_empty.pages, sizeof(theap->pages)); + // theap->thread_delayed_free = NULL; + theap->page_count = 0; +} + +static bool _mi_theap_page_destroy(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { + MI_UNUSED(arg1); + MI_UNUSED(arg2); + MI_UNUSED(pq); + + // ensure no more thread_delayed_free will be added + //_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); + + // stats + const size_t bsize = mi_page_block_size(page); + if (bsize > MI_LARGE_MAX_OBJ_SIZE) { + mi_theap_stat_decrease(theap, malloc_huge, bsize); + } + #if (MI_STAT>0) + _mi_page_free_collect(page, false); // update used count + const size_t inuse = page->used; + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { + mi_theap_stat_decrease(theap, malloc_normal, bsize * inuse); + #if (MI_STAT>1) + mi_theap_stat_decrease(theap, malloc_bins[_mi_bin(bsize)], inuse); + #endif + } + // mi_theap_stat_decrease(theap, malloc_requested, bsize * inuse); // todo: off for aligned blocks... 
+ #endif + + /// pretend it is all free now + mi_assert_internal(mi_page_thread_free(page) == NULL); + page->used = 0; + + // and free the page + // mi_page_free(page,false); + page->next = NULL; + page->prev = NULL; + mi_page_set_theap(page, NULL); + _mi_arenas_page_free(page, theap); + + return true; // keep going +} + +void _mi_theap_destroy_pages(mi_theap_t* theap) { + mi_theap_visit_pages(theap, &_mi_theap_page_destroy, NULL, NULL); + mi_theap_reset_pages(theap); +} + +#if MI_TRACK_HEAP_DESTROY +static bool mi_cdecl mi_theap_track_block_free(const mi_theap_t* theap, const mi_theap_area_t* area, void* block, size_t block_size, void* arg) { + MI_UNUSED(theap); MI_UNUSED(area); MI_UNUSED(arg); MI_UNUSED(block_size); + mi_track_free_size(block,mi_usable_size(block)); + return true; +} +#endif + +void mi_theap_destroy(mi_theap_t* theap) { + mi_assert(theap != NULL); + mi_assert(mi_theap_is_initialized(theap)); + mi_assert(!theap->allow_page_reclaim); + mi_assert(!theap->allow_page_abandon); + mi_assert_expensive(mi_theap_is_valid(theap)); + if (theap==NULL || !mi_theap_is_initialized(theap)) return; + #if MI_GUARDED + // _mi_warning_message("'mi_theap_destroy' called but MI_GUARDED is enabled -- using `mi_theap_delete` instead (theap at %p)\n", theap); + mi_theap_delete(theap); + return; + #else + if (theap->allow_page_reclaim) { + _mi_warning_message("'mi_theap_destroy' called but ignored as the theap was not created with 'allow_destroy' (theap at %p)\n", theap); + // don't free in case it may contain reclaimed pages, + mi_theap_delete(theap); + } + else { + // track all blocks as freed + #if MI_TRACK_HEAP_DESTROY + mi_theap_visit_blocks(theap, true, mi_theap_track_block_free, NULL); + #endif + // free all pages + _mi_theap_destroy_pages(theap); + mi_theap_free(theap,true); + } + #endif +} + +// forcefully destroy all theaps in the current thread +void _mi_theap_unsafe_destroy_all(mi_theap_t* theap) { + mi_assert_internal(theap != NULL); + if (theap == NULL) 
return; + mi_theap_t* curr = theap->tld->theaps; + while (curr != NULL) { + mi_theap_t* next = curr->next; + if (!curr->allow_page_reclaim) { + mi_theap_destroy(curr); + } + else { + _mi_theap_destroy_pages(curr); + } + curr = next; + } +} +*/ + +/* ----------------------------------------------------------- + Safe Heap delete +----------------------------------------------------------- */ + +// Safe delete a theap without freeing any still allocated blocks in that theap. +void _mi_theap_delete(mi_theap_t* theap, bool acquire_tld_theaps_lock) +{ + mi_assert(theap != NULL); + mi_assert(mi_theap_is_initialized(theap)); + mi_assert_expensive(mi_theap_is_valid(theap)); + if (theap==NULL || !mi_theap_is_initialized(theap)) return; + + // abandon all pages + _mi_theap_collect_abandon(theap); + + mi_assert_internal(theap->page_count==0); + _mi_theap_free(theap, true /* acquire heap->theaps_lock */, acquire_tld_theaps_lock); +} + + + +/* ----------------------------------------------------------- + Load/unload theaps +----------------------------------------------------------- */ +/* +void mi_theap_unload(mi_theap_t* theap) { + mi_assert(mi_theap_is_initialized(theap)); + mi_assert_expensive(mi_theap_is_valid(theap)); + if (theap==NULL || !mi_theap_is_initialized(theap)) return; + if (_mi_theap_heap(theap)->exclusive_arena == NULL) { + _mi_warning_message("cannot unload theaps that are not associated with an exclusive arena\n"); + return; + } + + // abandon all pages so all thread'id in the pages are cleared + _mi_theap_collect_abandon(theap); + mi_assert_internal(theap->page_count==0); + + // remove from theap list + mi_theap_free(theap, false); // but don't actually free the memory + + // disassociate from the current thread-local and static state + theap->tld = NULL; + return; +} + +bool mi_theap_reload(mi_theap_t* theap, mi_arena_id_t arena_id) { + mi_assert(mi_theap_is_initialized(theap)); + if (theap==NULL || !mi_theap_is_initialized(theap)) return false; + if 
(_mi_theap_heap(theap)->exclusive_arena == NULL) { + _mi_warning_message("cannot reload theaps that were not associated with an exclusive arena\n"); + return false; + } + if (theap->tld != NULL) { + _mi_warning_message("cannot reload theaps that were not unloaded first\n"); + return false; + } + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (_mi_theap_heap(theap)->exclusive_arena != arena) { + _mi_warning_message("trying to reload a theap at a different arena address: %p vs %p\n", _mi_theap_heap(theap)->exclusive_arena, arena); + return false; + } + + mi_assert_internal(theap->page_count==0); + + // re-associate with the current thread-local and static state + theap->tld = mi_theap_get_default()->tld; + + // reinit direct pages (as we may be in a different process) + mi_assert_internal(theap->page_count == 0); + for (size_t i = 0; i < MI_PAGES_DIRECT; i++) { + theap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; + } + + // push on the thread local theaps list + theap->tnext = theap->tld->theaps; + theap->tld->theaps = theap; + return true; +} +*/ + + +/* ----------------------------------------------------------- + Visit all theap blocks and areas + Todo: enable visiting abandoned pages, and + enable visiting all blocks of all theaps across threads +----------------------------------------------------------- */ + +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); + area->reserved = page->reserved * bsize; + area->committed = page->capacity * bsize; + area->blocks = mi_page_start(page); + area->used = page->used; // number of blocks in use (#553) + area->block_size = ubsize; + area->full_block_size = bsize; + area->reserved1 = page; +} + +static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { + mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); + *shift = MI_SIZE_BITS - mi_clz(divisor - 1); + *magic 
= ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); +} + +static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { + mi_assert_internal(n <= UINT32_MAX); + const uint64_t hi = ((uint64_t)n * magic) >> 32; + return (size_t)((hi + n) >> shift); +} + +bool _mi_theap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { + mi_assert(area != NULL); + if (area==NULL) return true; + mi_assert(page != NULL); + if (page == NULL) return true; + + _mi_page_free_collect(page,true); // collect both thread_delayed and local_free + mi_assert_internal(page->local_free == NULL); + if (page->used == 0) return true; + + size_t psize; + uint8_t* const pstart = mi_page_area(page, &psize); + mi_heap_t* const heap = mi_page_heap(page); + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding + + // optimize page with one block + if (page->capacity == 1) { + mi_assert_internal(page->used == 1 && page->free == NULL); + return visitor(heap, area, pstart, ubsize, arg); + } + mi_assert(bsize <= UINT32_MAX); + + // optimize full pages + if (page->used == page->capacity) { + uint8_t* block = pstart; + for (size_t i = 0; i < page->capacity; i++) { + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + return true; + } + + // create a bitmap of free blocks. 
+ #define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) + uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; + const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); + memset(free_map, 0, bmapsize * sizeof(intptr_t)); + if (page->capacity % MI_INTPTR_BITS != 0) { + // mark left-over bits at the end as free + size_t shift = (page->capacity % MI_INTPTR_BITS); + uintptr_t mask = (UINTPTR_MAX << shift); + free_map[bmapsize - 1] = mask; + } + + // fast repeated division by the block size + uint64_t magic; + size_t shift; + mi_get_fast_divisor(bsize, &magic, &shift); + + #if MI_DEBUG>1 + size_t free_count = 0; + #endif + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { + #if MI_DEBUG>1 + free_count++; + #endif + mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); + size_t offset = (uint8_t*)block - pstart; + mi_assert_internal(offset % bsize == 0); + mi_assert_internal(offset <= UINT32_MAX); + size_t blockidx = mi_fast_divide(offset, magic, shift); + mi_assert_internal(blockidx == offset / bsize); + mi_assert_internal(blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / MI_INTPTR_BITS); + size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); + free_map[bitidx] |= ((uintptr_t)1 << bit); + } + mi_assert_internal(page->capacity == (free_count + page->used)); + + // walk through all blocks skipping the free ones + #if MI_DEBUG>1 + size_t used_count = 0; + #endif + uint8_t* block = pstart; + for (size_t i = 0; i < bmapsize; i++) { + if (free_map[i] == 0) { + // every block is in use + for (size_t j = 0; j < MI_INTPTR_BITS; j++) { + #if MI_DEBUG>1 + used_count++; + #endif + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + } + else { + // visit the used blocks in the mask + uintptr_t m = ~free_map[i]; + while (m != 0) { + #if MI_DEBUG>1 + used_count++; + #endif + size_t bitidx = mi_ctz(m); + if (!visitor(heap, area, block + (bitidx * bsize), 
ubsize, arg)) return false; + m &= m - 1; // clear least significant bit + } + block += bsize * MI_INTPTR_BITS; + } + } + mi_assert_internal(page->used == used_count); + return true; +} + + + +// Separate struct to keep `mi_page_t` out of the public interface +typedef struct mi_theap_area_ex_s { + mi_heap_area_t area; + mi_page_t* page; +} mi_theap_area_ex_t; + +typedef bool (mi_theap_area_visit_fun)(const mi_theap_t* theap, const mi_theap_area_ex_t* area, void* arg); + +static bool mi_theap_visit_areas_page(mi_theap_t* theap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { + MI_UNUSED(theap); + MI_UNUSED(pq); + mi_theap_area_visit_fun* fun = (mi_theap_area_visit_fun*)vfun; + mi_theap_area_ex_t xarea; + xarea.page = page; + _mi_heap_area_init(&xarea.area, page); + return fun(theap, &xarea, arg); +} + +// Visit all theap pages as areas +static bool mi_theap_visit_areas(const mi_theap_t* theap, mi_theap_area_visit_fun* visitor, void* arg) { + if (visitor == NULL) return false; + return mi_theap_visit_pages((mi_theap_t*)theap, &mi_theap_visit_areas_page, true, (void*)(visitor), arg); // note: function pointer to void* :-{ +} + +// Just to pass arguments +typedef struct mi_visit_blocks_args_s { + bool visit_blocks; + mi_block_visit_fun* visitor; + void* arg; +} mi_visit_blocks_args_t; + +static bool mi_theap_area_visitor(const mi_theap_t* theap, const mi_theap_area_ex_t* xarea, void* arg) { + mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; + if (!args->visitor(_mi_theap_heap(theap), &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; + if (args->visit_blocks) { + return _mi_theap_area_visit_blocks(&xarea->area, xarea->page, args->visitor, args->arg); + } + else { + return true; + } +} + +// Visit all blocks in a theap +bool mi_theap_visit_blocks(const mi_theap_t* theap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_visit_blocks_args_t args = { visit_blocks, visitor, arg }; + return 
mi_theap_visit_areas(theap, &mi_theap_area_visitor, &args); +} + diff --git a/system/lib/mimalloc/src/threadlocal.c b/system/lib/mimalloc/src/threadlocal.c new file mode 100644 index 0000000000000..c7822c1c3bacc --- /dev/null +++ b/system/lib/mimalloc/src/threadlocal.c @@ -0,0 +1,224 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2026, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Implement dynamic thread local variables (for heap's). +Unlike most OS native implementations there is no limit on the number +that can be allocated. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/prim.h" + +/* ----------------------------------------------------------- + Each thread can have (a dynamically expanding) array of + thread-local values. Each slot has a value and a version. + The version is used to safely reuse slots. +----------------------------------------------------------- */ +typedef struct mi_tls_slot_s { + size_t version; + void* value; +} mi_tls_slot_t; + +typedef struct mi_thread_locals_s { + size_t count; + mi_tls_slot_t slots[1]; +} mi_thread_locals_t; + +static mi_thread_locals_t mi_thread_locals_empty = { 0, {{0,NULL}} }; + +mi_decl_thread mi_thread_locals_t* mi_thread_locals = &mi_thread_locals_empty; // always point to a valid `mi_thread_locals_t` + + +/* ----------------------------------------------------------- + Each key consists of the slot index in the lower bits, + and its version it the top bits. 
When we get a value + the version must match or we return NULL. When we set + a value, we also set the version of the key. +----------------------------------------------------------- */ + +#define MI_TLS_IDX_BITS (MI_SIZE_BITS/2) +#define MI_TLS_IDX_MASK ((MI_ZU(1)<> MI_TLS_IDX_BITS); +} + +static mi_thread_local_t mi_key_create( size_t index, size_t version ) { + mi_assert_internal(version != 0); + mi_assert_internal(index <= MI_TLS_IDX_MASK); + const mi_thread_local_t key = ((version << MI_TLS_IDX_BITS) | index); + mi_assert_internal(key != 0); + return key; +} + + +// dynamically reallocate the thread local slots when needed +static mi_thread_locals_t* mi_thread_locals_expand(size_t least_idx) { + mi_thread_locals_t* tls_old = mi_thread_locals; + const size_t count_old = tls_old->count; + size_t count; + if (count_old==0) { + tls_old = NULL; // so we allocate fresh from mi_thread_locals_empty + count = 16; // start with 16 slots + } + else if (count_old >= 1024) { + count = count_old + 1024; // at some point increase linearly + } + else { + count = 2*count_old; // and double initially + } + if (count <= least_idx) { + count = least_idx + 1; + } + mi_thread_locals_t* tls = (mi_thread_locals_t*)mi_rezalloc(tls_old, sizeof(mi_thread_locals_t) + count*sizeof(mi_tls_slot_t)); + if mi_unlikely(tls==NULL) return NULL; + tls->count = count; + mi_thread_locals = tls; + return tls; +} + +static mi_decl_noinline bool mi_thread_local_set_expand( mi_thread_local_t key, void* val ) { + if (val==NULL) return true; + const size_t idx = mi_key_index(key); + mi_thread_locals_t* tls = mi_thread_locals_expand(idx); + if (tls==NULL) return false; + mi_assert_internal(tls == mi_thread_locals); + mi_assert_internal(idx < tls->count); + tls->slots[idx].value = val; + tls->slots[idx].version = mi_key_version(key); + return true; +} + +// set a tls slot; returns `true` if successful. +// Can return `false` if we could not reallocate the slots array. 
+bool _mi_thread_local_set( mi_thread_local_t key, void* val ) { + mi_thread_locals_t* tls = mi_thread_locals; + mi_assert_internal(tls!=NULL); + mi_assert_internal(key!=0); + const size_t idx = mi_key_index(key); + if mi_likely(idx < tls->count) { + tls->slots[idx].value = val; + tls->slots[idx].version = mi_key_version(key); + return true; + } + else { + return mi_thread_local_set_expand( key, val ); // tailcall + } +} + +// get a tls slot value +void* _mi_thread_local_get( mi_thread_local_t key ) { + const mi_thread_locals_t* const tls = mi_thread_locals; + mi_assert_internal(tls!=NULL); + mi_assert_internal(key!=0); + const size_t idx = mi_key_index(key); + if mi_likely(idx < tls->count && mi_key_version(key) == tls->slots[idx].version) { + return tls->slots[idx].value; + } + else { + return NULL; + } +} + +void _mi_thread_locals_thread_done(void) { + mi_thread_locals_t* const tls = mi_thread_locals; + if (tls!=NULL && tls->count > 0) { + mi_free(tls); + mi_thread_locals = &mi_thread_locals_empty; + } +} + +/* ----------------------------------------------------------- +Create and free fresh TLS key's +----------------------------------------------------------- */ +#include "bitmap.h" + +static mi_lock_t mi_thread_locals_lock; // we need a lock in order to re-allocate the slot bits +static mi_bitmap_t* mi_thread_locals_free; // reuse an arena bitmap to track which slots were assigned (1=free, 0=in-use) +static size_t mi_thread_locals_version; // version to be able to reuse slots safely + +void _mi_thread_locals_init(void) { + mi_lock_init(&mi_thread_locals_lock); +} + +void _mi_thread_locals_done(void) { + mi_lock(&mi_thread_locals_lock) { + mi_bitmap_t* const slots = mi_thread_locals_free; + mi_free(slots); + } + mi_lock_done(&mi_thread_locals_lock); +} + +// strange signature but allows us to reuse the arena code for claiming free pages +static bool mi_thread_local_claim_fun(size_t _slice_index, mi_arena_t* _arena, bool* keep_set) { + MI_UNUSED(_slice_index); 
MI_UNUSED(_arena); + *keep_set = false; + return true; +} + +// When we claim a free slot, we increase the global version counter +// (so if we reuse a slot it will be returning NULL initially when a thread tries to get it) +static mi_thread_local_t mi_thread_local_claim(void) { + size_t idx = 0; + if (mi_thread_locals_free != NULL && mi_bitmap_try_find_and_claim(mi_thread_locals_free,0,&idx,&mi_thread_local_claim_fun,NULL)) { + mi_thread_locals_version++; + if (mi_thread_locals_version == SIZE_MAX/2) { mi_thread_locals_version = 1; } + return mi_key_create( idx, mi_thread_locals_version); + } + else { + return 0; + } +} + +static bool mi_thread_local_create_expand(void) { + mi_bitmap_t* slots = mi_thread_locals_free; + // 1024 bits at a time + const size_t oldcount = (slots==NULL ? 0 : mi_bitmap_max_bits(slots)); + const size_t newcount = 1024 + oldcount; + if (newcount > MI_TLS_IDX_MASK) { return false; } + const size_t newsize = mi_bitmap_size( newcount, NULL ); + slots = (mi_bitmap_t*)mi_realloc_aligned(slots, newsize, MI_BCHUNK_SIZE); + if (slots == NULL) { return false; } + mi_bitmap_init(slots, newcount, true /* or otherwise we would zero all old entries */); + mi_bitmap_unsafe_setN(slots, oldcount, newcount - oldcount); + mi_thread_locals_free = slots; + return true; +} + + +// create a fresh key +mi_thread_local_t _mi_thread_local_create(void) { + mi_thread_local_t key = 0; + mi_lock(&mi_thread_locals_lock) { + key = mi_thread_local_claim(); + if (key==0) { + if (mi_thread_local_create_expand()) { + key = mi_thread_local_claim(); + } + } + } + return key; +} + +// free a key +void _mi_thread_local_free(mi_thread_local_t key) { + if (key==0) return; + const size_t idx = mi_key_index(key); + mi_lock(&mi_thread_locals_lock) { + mi_bitmap_t* const slots = mi_thread_locals_free; + if (slots!=NULL && idx < mi_bitmap_max_bits(slots)) { + mi_bitmap_set(slots,idx); + } + } +} + diff --git a/test/test_core.py b/test/test_core.py index 87dfd4cbc5288..2a455e1d735dc 
100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -8654,8 +8654,8 @@ def test_mallinfo(self): @parameterized({ '': ([],), 'emmalloc': (['-sMALLOC=emmalloc'],), - # FIXME(https://github.com/emscripten-core/emscripten/issues/23090) - # 'mimalloc': (['-sMALLOC=mimalloc'],), + # TODO: investigate removing `-sABORTING_MALLOC=0`, as it's only needed for `wasm64_4gb.test_wrap_malloc_mimalloc` + 'mimalloc': (['-sMALLOC=mimalloc', '-sABORTING_MALLOC=0'],), }) def test_wrap_malloc(self, args): self.do_runf('core/test_wrap_malloc.c', 'OK.', cflags=args) diff --git a/test/test_other.py b/test/test_other.py index 30fddb5cbfac6..3e5054da4b751 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -7397,10 +7397,10 @@ def test_dlmalloc_modes(self): 'O2': (['-O2'], 137000), 'emmalloc': (['-sMALLOC=emmalloc'], 185000), 'dlmalloc': (['-sMALLOC=dlmalloc'], 191000), - 'mimalloc': (['-sMALLOC=mimalloc'], 245000), + 'mimalloc': (['-sMALLOC=mimalloc'], 255000), 'emmalloc_O2': (['-sMALLOC=emmalloc', '-O2'], 130000), 'dlmalloc_O2': (['-sMALLOC=dlmalloc', '-O2'], 137000), - 'mimalloc_O2': (['-sMALLOC=mimalloc', '-O2'], 181000), + 'mimalloc_O2': (['-sMALLOC=mimalloc', '-O2'], 193000), }) # This test verifies the output code size of the different -sMALLOC= modes. 
def test_malloc_size(self, args, max_size): diff --git a/tools/system_libs.py b/tools/system_libs.py index 942bb018bfd19..09ddc30ba24d9 100644 --- a/tools/system_libs.py +++ b/tools/system_libs.py @@ -1869,14 +1869,26 @@ class libmimalloc(MTLibrary): # build emmalloc as only a system allocator, without exporting itself onto # malloc/free in the global scope '-DEMMALLOC_NO_STD_EXPORTS', + # disable large pages by default, see: + # https://github.com/microsoft/mimalloc/commit/9199d54bcf1e6dea0deb61a3a8a4b3ea4b45a341 + '-DMI_ENABLE_LARGE_PAGES=0', + # halve the page size to 32KiB on wasm64 and to 16KiB on wasm32 + # https://github.com/microsoft/mimalloc/issues/647#issuecomment-1324109021 + # https://github.com/emscripten-core/emscripten/issues/20645#issuecomment-1962964755 + '-DMI_ARENA_SLICE_SHIFT=(12 + MI_SIZE_SHIFT)', + # `malloc`ed pointers must be aligned at least as strictly as max_align_t + '-DMI_MAX_ALIGN_SIZE=8', + # reserve memory in 64 MiB chunks (internally divided by 4) + # Note: keep in sync with the -sINITIAL_HEAP default + '-DMI_DEFAULT_ARENA_RESERVE=65536', # build mimalloc with an override of malloc/free '-DMI_MALLOC_OVERRIDE', # TODO: add build modes that include debug checks 1,2,3 '-DMI_DEBUG=0', # disable `assert()` in the underlying emmalloc allocator '-DNDEBUG', - # avoid use of `__builtin_thread_pointer()` - '-DMI_LIBC_MUSL', + # enable use of `__builtin_thread_pointer()` + '-DMI_USE_BUILTIN_THREAD_POINTER', ] # malloc/free/calloc are runtime functions and can be generated during LTO