diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 353303e54c..6920ab4713 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -150,6 +150,8 @@ jobs:
           - { name: 'half',   image: '2026',           build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DUSE_BLOSC=OFF -DUSE_IMATH_HALF=ON' }
           - { name: 'sse',    image: '2026',           build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=SSE42' }
           - { name: 'avx',    image: '2026',           build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX' }
+          - { name: 'simd',   image: '2026',           build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX -DUSE_STD_SIMD=ON' }
+          - { name: 'simd-fallback', image: '2026',    build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_SIMD=AVX -DUSE_STD_SIMD=OFF' }
           - { name: 'pygrid', image: '2026',           build: 'Release', components: 'core,python,bin,view,render,test', cmake: '-DOPENVDB_PYTHON_WRAP_ALL_GRID_TYPES=ON' }
           - { name: 'asan',   image: '2026',           build: 'asan',    components: 'core,test',                        cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON -DNANOVDB_USE_OPENVDB=ON -DOPENVDB_AX_STATIC=OFF -DOPENVDB_CORE_STATIC=OFF -DUSE_BLOSC=OFF' } # We never called blosc_destroy(), so disable blosc to silence these errors
           - { name: 'ubsan',  image: '2026',           build: 'ubsan',   components: 'core,test',                        cmake: '-DDISABLE_DEPENDENCY_VERSION_CHECKS=ON -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ' }
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 089381aa41..25b53106cb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,6 +131,13 @@ required.]=] ${USE_EXR})
 option(USE_PNG "Use PNG while building openvdb components." OFF)
 option(USE_AX "Use OpenVDB AX while building openvdb components." ${OPENVDB_BUILD_AX})
 option(USE_NANOVDB "Use NanoVDB while building openvdb components." ${OPENVDB_BUILD_NANOVDB})
+option(USE_STD_SIMD [=[
+Suppress auto-detection of std::experimental::simd (C++ Parallelism TS v2) and
+force the std::array fallback backend for openvdb::simd::Simd<T,W>.  By default
+the TS v2 header is used when the compiler provides it; set this to OFF to
+disable.  Has no effect on non-x86 or compilers that do not ship the TS header.
+Note: unlike USE_VCL this option requires no external dependency — the TS header
+is part of the compiler's standard library.]=] ON)
 
 cmake_dependent_option(OPENVDB_DISABLE_BOOST_IMPLICIT_LINKING
   "Disable the implicit linking of Boost libraries on Windows" ON "WIN32" OFF)
@@ -420,6 +427,11 @@ elseif(OPENVDB_SIMD STREQUAL "SSE42")
   add_compile_definitions("$<$<COMPILE_LANGUAGE:CXX>:OPENVDB_USE_SSE42>")
 endif()
 
+# Suppress std::experimental::simd auto-detection if USE_STD_SIMD is OFF.
+if(NOT USE_STD_SIMD)
+  add_compile_definitions("$<$<COMPILE_LANGUAGE:CXX>:OPENVDB_NO_STD_SIMD>")
+endif()
+
 #########################################################################
 
 # Configure our cmake modules to only search for static libraries
diff --git a/openvdb/openvdb/Platform.h b/openvdb/openvdb/Platform.h
index 017876490b..dae09ee8db 100644
--- a/openvdb/openvdb/Platform.h
+++ b/openvdb/openvdb/Platform.h
@@ -55,6 +55,17 @@
     #endif
 #endif
 
+/// Auto-detect std::experimental::simd (C++ Parallelism TS v2).
+/// When available, openvdb::simd::Simd<T,W> wraps it to emit native SIMD
+/// instructions without relying on the auto-vectorizer.
+/// Suppress with -DOPENVDB_NO_STD_SIMD to force the std::array fallback.
+#if !defined(OPENVDB_NO_STD_SIMD) && defined(__has_include) && __has_include(<experimental/simd>)
+#  include <experimental/simd>
+#  ifdef __cpp_lib_experimental_parallel_simd
+#    define OPENVDB_USE_STD_SIMD 1
+#  endif
+#endif
+
 /// Windows defines
 #ifdef _WIN32
     ///Disable the non-portable Windows definitions of min() and max() macros
diff --git a/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h b/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h
index 1e70a12677..f394141e91 100644
--- a/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h
+++ b/openvdb/openvdb/points/impl/PointRasterizeEllipsoidsSDFImpl.h
@@ -399,6 +399,20 @@ struct EllipsoidTransferQuat final :
         this->BaseT::rasterizePoint(ijk, id, bounds, R);
     }
 
+    /// @brief  Point-by-point dispatch — ellipsoids are not batchable via
+    ///   the spherical SIMD path.  This override prevents SphericalTransfer::
+    ///   rasterizePoints (which calls rasterizeN2) from being instantiated for
+    ///   the FixedBandRadius<Vec3f> radius type used by ellipsoids.
+    inline void rasterizePoints(const Coord& ijk,
+                    const Index start,
+                    const Index end,
+                    const CoordBBox& bounds)
+    {
+        for (Index i = start; i < end; ++i) {
+            this->rasterizePoint(ijk, i, bounds);
+        }
+    }
+
 private:
     std::unique_ptr<QuatHandleT> mRotationHandle;
 };
@@ -446,6 +460,17 @@ struct EllipsoidTransferMat3 final :
         this->BaseT::rasterizePoint(ijk, id, bounds, mXformHandle->get(id));
     }
 
+    /// @brief  See EllipsoidTransferQuat::rasterizePoints.
+    inline void rasterizePoints(const Coord& ijk,
+                    const Index start,
+                    const Index end,
+                    const CoordBBox& bounds)
+    {
+        for (Index i = start; i < end; ++i) {
+            this->rasterizePoint(ijk, i, bounds);
+        }
+    }
+
 private:
     std::unique_ptr<XformHandleT> mXformHandle;
 };
diff --git a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h
index 393eb26c22..563efdf658 100644
--- a/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h
+++ b/openvdb/openvdb/points/impl/PointRasterizeSDFImpl.h
@@ -9,6 +9,13 @@
 #ifndef OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
 #define OPENVDB_POINTS_RASTERIZE_SDF_IMPL_HAS_BEEN_INCLUDED
 
+#include <openvdb/simd/Simd.h>
+
+/// @brief  Set to 1 to disable batched SIMD rasterization and always use the
+///   scalar per-point path, regardless of the ISA in use.  Useful for
+///   debugging or performance comparison.
+#define OPENVDB_DISABLE_BATCHED_TRANSFERS 0
+
 namespace openvdb {
 OPENVDB_USE_VERSION_NAMESPACE
 namespace OPENVDB_VERSION_NAME {
@@ -177,6 +184,19 @@ struct SignedDistanceFieldTransfer :
     using FilteredTransferT = FilteredTransfer<FilterT>;
     using PositionHandleT = AttributeHandle<Vec3f, PositionCodecT>;
 
+    /// @brief  Return the SIMD batch width for arithmetic type RealT.
+    ///   Returns 1 (scalar fallback) when batched transfers are disabled or
+    ///   no ISA is configured.
+    template<typename RealT>
+    static constexpr size_t GetBatchSize()
+    {
+#if OPENVDB_DISABLE_BATCHED_TRANSFERS
+        return 1;
+#else
+        return simd::SimdNativeT<RealT>::width;
+#endif
+    }
+
     // typically the max radius of all points rounded up
     inline Vec3i range(const Coord&, size_t) const { return mMaxKernelWidth; }
 
@@ -305,10 +325,7 @@ struct SphericalTransfer :
             const std::unordered_map<const PointDataTree::LeafNodeType*, Index>* ids = nullptr)
         : SphericalTransfer(pidx, Vec3i(width), rt, source, filter, interrupt, surface, cpg, ids) {}
 
-    /// @brief  For each point, stamp a sphere with a given radius by running
-    ///   over all intersecting voxels and calculating if this point is closer
-    ///   than the currently held distance value. Note that the default value
-    ///   of the surface buffer should be the background value of the surface.
+    /// @brief  Dispatch: batched SIMD path when N2 > 1, scalar fallback when N2 == 1.
     inline void rasterizePoint(const Coord& ijk,
                     const Index id,
                     const CoordBBox& bounds)
@@ -328,6 +345,46 @@ struct SphericalTransfer :
         this->rasterizePoint(P, id, bounds, this->mRadius.eval(id));
     }
 
+    /// @brief  For each point in [start, end), batch into groups of N2 and
+    ///   dispatch to rasterizeN2.  Partial final batches pad with -1 sentinels
+    ///   and fall through to scalar for size-1 remainders.
+    inline void rasterizePoints(const Coord& ijk,
+                    const Index start,
+                    const Index end,
+                    const CoordBBox& bounds)
+    {
+        constexpr size_t N2 = BaseT::template GetBatchSize<RealT>();
+        if constexpr (N2 == 1) {
+            // Scalar fallback: no ISA configured or batching disabled
+            for (Index i = start; i < end; ++i) {
+                this->rasterizePoint(ijk, i, bounds);
+            }
+        } else {
+            // Batched SIMD path.  N2 must be a power of two.
+            static_assert((N2 > 1) && !(N2 & (N2 - 1)));
+
+            std::array<int64_t, N2> ids;
+            size_t offset = 0;
+            for (Index i = start; i < end; ++i) {
+                if (!BaseT::filter(i)) continue;
+                ids[offset++] = int64_t(i);
+                if (offset == N2) {
+                    this->rasterizeN2<N2>(ijk, ids, bounds);
+                    offset = 0;
+                }
+            }
+
+            if (offset == 0) return;
+            if (offset == 1) {
+                this->rasterizePoint(ijk, Index(ids[0]), bounds);
+            } else {
+                // Pad remaining slots with -1 sentinel; stamp will ignore them.
+                for (; offset < N2; ++offset) ids[offset] = int64_t(-1);
+                this->rasterizeN2<N2>(ijk, ids, bounds);
+            }
+        }
+    }
+
     /// @brief Allow early termination if all voxels in the surface have been
     ///   deactivated (all interior)
     inline bool endPointLeaf(const PointDataTree::LeafNodeType&)
@@ -371,9 +428,6 @@ struct SphericalTransfer :
     ///   to pass a different P and scaled FixedBandRadius from its ellipsoid
     ///   path (as isolated points are stamped as spheres with a different
     ///   scale and positions may have been smoothed).
-    /// @todo   I would prefer this second function wasn't necessary but there
-    ///   is no easy way to allow differently scaled radii to exist in a more
-    ///   efficient manner, nor use a different P.
     inline void rasterizePoint(const Vec3d& P,
                     const Index id,
                     const CoordBBox& bounds,
@@ -383,64 +437,179 @@ struct SphericalTransfer :
         CoordBBox intersectBox(Coord::round(P - max), Coord::round(P + max));
         intersectBox.intersect(bounds);
         if (intersectBox.empty()) return;
+        this->stamp<RealT>(RealT(P[0]), RealT(P[1]), RealT(P[2]),
+            RealT(r.get()), RealT(r.minSq()), RealT(r.maxSq()), &id, intersectBox);
+    }
+
+    /// @brief  Gather N2 points from the point-data leaf, convert AoS→SoA,
+    ///   load into Simd<RealT,Size> vectors, and call the generic stamp.
+    template<size_t Size>
+    inline void rasterizeN2(const Coord& ijk,
+        const std::array<int64_t, BaseT::template GetBatchSize<RealT>()>& points,
+        const CoordBBox& bounds)
+    {
+        using SimdT = simd::Simd<RealT, int(Size)>;
+
+        // Scratch: 3 SoA position arrays (+ 3 radius arrays for varying radii)
+        constexpr size_t CacheWords = (RadiusType::Fixed ? 3 : 6) * Size;
+        std::array<RealT, CacheWords> cache;
+
+        const Vec3d ijkd = ijk.asVec3d();
+        Vec3d tmp{};
+
+        // Find last valid index; sentinel -1 marks padding slots.
+        // It is guaranteed at least 2 valid entries (caller routes size-1 to scalar).
+        int64_t firstInvalid = Size;
+        for (size_t i = 0; i < Size; ++i) {
+            if (points[i] == int64_t(-1)) { firstInvalid = int64_t(i); break; }
+        }
+
+        // AoS → SoA: convert positions (and optionally radii) to RealT arrays.
+        for (size_t i = 0; i < Size; ++i) {
+            if (int64_t(i) < firstInvalid) {
+                tmp = ijkd + Vec3d(this->mPosition->get(Index(points[i])));
+                tmp = this->transformSourceToTarget(tmp);
+            }
+            cache[i + Size*0] = tmp[0];
+            cache[i + Size*1] = tmp[1];
+            cache[i + Size*2] = tmp[2];
+            if constexpr (!RadiusType::Fixed) {
+                if (int64_t(i) < firstInvalid) {
+                    auto r = this->mRadius.eval(Index(points[i]));
+                    cache[i + Size*3] = RealT(r.get());
+                    cache[i + Size*4] = RealT(r.minSq());
+                    cache[i + Size*5] = RealT(r.maxSq());
+                } else {
+                    cache[i + Size*3] = cache[i + Size*4] = cache[i + Size*5] = RealT(0);
+                }
+            }
+        }
+
+        // Load SoA arrays into SIMD registers.
+        SimdT px(cache.data() + Size*0);
+        SimdT py(cache.data() + Size*1);
+        SimdT pz(cache.data() + Size*2);
+
+        SimdT rad, rmin2, rmax2;
+        if constexpr (RadiusType::Fixed) {
+            auto r = this->mRadius.eval(Index(0));  // same for all
+            rad   = SimdT(RealT(r.get()));
+            rmin2 = SimdT(RealT(r.minSq()));
+            rmax2 = SimdT(RealT(r.maxSq()));
+        } else {
+            rad   = SimdT(cache.data() + Size*3);
+            rmin2 = SimdT(cache.data() + Size*4);
+            rmax2 = SimdT(cache.data() + Size*5);
+        }
+
+        // Compute bounding box from the widest point in the batch.
+        const SimdT rmax_broad = simd::hmax(rmax2);  // broadcast max radius²
+        // Conservative: use sqrt(max rmax2) as the search radius
+        const RealT searchR = std::sqrt(RealT(simd::hmax(rmax2)));
+        const SimdT pxMin = simd::hmin(px), pyMin = simd::hmin(py), pzMin = simd::hmin(pz);
+        const SimdT pxMax = simd::hmax(px), pyMax = simd::hmax(py), pzMax = simd::hmax(pz);
+
+        CoordBBox intersectBox(
+            Coord::round(Vec3d(RealT(pxMin) - searchR, RealT(pyMin) - searchR, RealT(pzMin) - searchR)),
+            Coord::round(Vec3d(RealT(pxMax) + searchR, RealT(pyMax) + searchR, RealT(pzMax) + searchR)));
+        intersectBox.intersect(bounds);
+        if (intersectBox.empty()) return;
+
+        this->stamp<SimdT>(px, py, pz, rad, rmin2, rmax2, points.data(), intersectBox);
+    }
+
+    /// @brief  Generic-T stamp: ScalarT is either RealT (scalar) or Simd<RealT,W>
+    ///   (SIMD batch).  The same source compiles for both with zero #ifdef.
+    ///
+    ///   Key design points:
+    ///   - Arithmetic operators (+, -, *, /) and comparisons (>=, <=, ==) are
+    ///     overloaded for Simd<T,W> and resolve to the built-in operators for
+    ///     scalar T.  No simd::add() free-function calls.
+    ///   - simd::where(mask, a, b) replaces ternary; scalar overload is a
+    ///     plain ternary.
+    ///   - simd::hmin(dist) reduces all lanes to the minimum and broadcasts
+    ///     back, keeping the result in ScalarT space.  For scalar T it is the
+    ///     identity.
+    ///   - ScalarT(hmin_result) invokes explicit operator T() on Simd<T,W> to
+    ///     extract the scalar at the single write boundary; for scalar T this
+    ///     is a trivial copy.
+    ///   - simd::hall / simd::hany return bool for both scalar and SIMD.
+    template<typename ScalarT, typename IdT = const Index*>
+    inline void stamp(const ScalarT& Px, const ScalarT& Py, const ScalarT& Pz,
+                      const ScalarT& r,  const ScalarT& Rmin2, const ScalarT& Rmax2,
+                      const IdT ids, const CoordBBox& intersection)
+    {
+        using simd::where;
+        using simd::hall;
+        using simd::hany;
+        using simd::hmin;
+        using simd::hfirst;
+        using ElemT = simd::scalar_t<ScalarT>;
+
+        if (intersection.empty()) return;
 
         auto* const data = this->template buffer<0>();
-        auto* const cpg = CPG ? this->template buffer<CPG ? 1 : 0>() : nullptr;
+        [[maybe_unused]] auto* const cpgbuf =
+            CPG ? this->template buffer<CPG ? 1 : 0>() : nullptr;
         auto& mask = *(this->template mask<0>());
 
-        // If min2 == 0.0, then the index space radius is equal to or less than
-        // the desired half band. In this case each sphere interior always needs
-        // to be filled with distance values as we won't ever reach the negative
-        // background value. If, however, a point overlaps a voxel coord exactly,
-        // x2y2z2 will be 0.0. Forcing min2 to be less than zero here avoids
-        // incorrectly setting these voxels to inactive -background values as
-        // x2y2z2 will never be < 0.0. We still want the lteq logic in the
-        // (x2y2z2 <= min2) check as this is valid when min2 > 0.0.
-        const RealT min2 = r.minSq() == 0.0 ? -1.0 : r.minSq();
-        const RealT max2 = r.maxSq();
+        // When Rmin2 == 0 the sphere interior must be filled but x2y2z2 can
+        // equal 0 at the centre, which would incorrectly trigger the <= check.
+        // Force min2 negative so x2y2z2 <= min2 is never true in that case.
+        const ScalarT min2 = where(Rmin2 == ScalarT(ElemT(0)), ScalarT(ElemT(-1)), Rmin2);
+        const ScalarT max2 = Rmax2;
+        const ScalarT vdx(ElemT(this->mDx));
 
-        const Coord& a(intersectBox.min());
-        const Coord& b(intersectBox.max());
+        const Coord& a(intersection.min());
+        const Coord& b(intersection.max());
         for (Coord c = a; c.x() <= b.x(); ++c.x()) {
-            const RealT x2 = static_cast<RealT>(math::Pow2(c.x() - P[0]));
-            const Index i = ((c.x() & (DIM-1u)) << 2*LOG2DIM); // unsigned bit shift mult
+            const ScalarT x2 = simd::square(ScalarT(ElemT(c.x())) - Px);
+            const Index ii = ((c.x() & (DIM-1u)) << 2*LOG2DIM);
             for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) {
-                const RealT x2y2 = static_cast<RealT>(x2 + math::Pow2(c.y() - P[1]));
-                const Index ij = i + ((c.y() & (DIM-1u)) << LOG2DIM);
+                const ScalarT x2y2 = x2 + simd::square(ScalarT(ElemT(c.y())) - Py);
+                const Index ij = ii + ((c.y() & (DIM-1u)) << LOG2DIM);
                 for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) {
-                    const Index offset = ij + /*k*/(c.z() & (DIM-1u));
-                    if (!mask.isOn(offset)) continue; // inside existing level set or not in range
+                    const Index offset = ij + (c.z() & (DIM-1u));
+                    if (!mask.isOn(offset)) continue;
+
+                    const ScalarT x2y2z2 = x2y2 + simd::square(ScalarT(ElemT(c.z())) - Pz);
 
-                    const RealT x2y2z2 = static_cast<RealT>(x2y2 + math::Pow2(c.z() - P[2]));
-                    if (x2y2z2 >= max2) continue; //outside narrow band of particle in positive direction
-                    if (x2y2z2 <= min2) { //outside narrow band of the particle in negative direction. can disable this to fill interior
+                    // All lanes outside positive band — skip voxel entirely.
+                    if (hall(x2y2z2 >= max2)) continue;
+
+                    // Any lane inside negative band — voxel is interior.
+                    if (hany(x2y2z2 <= min2)) {
                         data[offset] = -(this->mBackground);
                         mask.setOff(offset);
                         continue;
                     }
 
-                    const ValueT d = ValueT(this->mDx * (math::Sqrt(x2y2z2) - r.get())); // back to world space
+                    // Distance to surface, back to world space.
+                    const ScalarT dist = vdx * (simd::sqrt(x2y2z2) - r);
+
+                    // hmin reduces all lanes to the minimum distance and
+                    // broadcasts it back into ScalarT so subsequent arithmetic
+                    // stays in the same type domain.  operator ElemT() then
+                    // extracts the scalar at this single write boundary.
+                    const ScalarT mindist = hmin(dist);
+                    const ValueT d = ValueT(ElemT(mindist));
+
                     ValueT& v = data[offset];
                     if (d < v) {
                         v = d;
-                        OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
-                        if (CPG) cpg[offset] = Int64(this->mPLeafMask | Index64(id));
-                        OPENVDB_NO_TYPE_CONVERSION_WARNING_END
-                        // transfer attributes - we can't use this here as the exposed
-                        // function signatures take vector of attributes (i.e. an unbounded
-                        // size). If we instead clamped the attribute transfer to a fixed
-                        // amount of attributes we could get rid of the closest point logic
-                        // entirely. @todo consider adding another signature for increased
-                        // performance
-                        // this->foreach([&](auto* buffer, const size_t idx) {
-                        //     using Type = typename std::remove_pointer<decltype(buffer)>::type;
-                        //     buffer[offset] = mAttributes.template get<Type>(idx);
-                        // })
+                        if constexpr (CPG) {
+                            // Find which lane contributed the minimum distance.
+                            const int winner = hfirst(mindist == dist);
+                            OPENVDB_ASSERT(winner >= 0);
+                            OPENVDB_NO_TYPE_CONVERSION_WARNING_BEGIN
+                            cpgbuf[offset] = Int64(this->mPLeafMask | Index64(ids[winner]));
+                            OPENVDB_NO_TYPE_CONVERSION_WARNING_END
+                        }
                     }
                 }
             }
-        } // outer sdf voxel
-    } // point idx
+        }
+    }
 };
 
 /// @brief  The transfer implementation for averaging of positions followed by
diff --git a/openvdb/openvdb/simd/ASSEMBLY_NOTES.md b/openvdb/openvdb/simd/ASSEMBLY_NOTES.md
new file mode 100644
index 0000000000..ef487b79b6
--- /dev/null
+++ b/openvdb/openvdb/simd/ASSEMBLY_NOTES.md
@@ -0,0 +1,94 @@
+# Assembly notes for `Simd<T,W>` — `rasterizeSdf` demo path
+
+Examined symbol:
+`SphericalTransfer<..., NullCodec, FixedBandRadius<double>, NullFilter, false>::rasterizeN2<4>`
+
+Object file: `build/.../unittest/CMakeFiles/vdb_test.dir/TestPointRasterizeSDF.cc.o`  
+Build flags: `-O3 -mavx -msse4.2` (CPU: Intel Core Ultra 9 285K — supports AVX2)
+
+---
+
+## Inner per-voxel z-loop (hot path)
+
+The `stamp<Simd<double,4>>` kernel is fully inlined into `rasterizeN2<4>`.
+Key instructions in the innermost loop body:
+
+```nasm
+; --- broadcast voxel coord to all 4 particle lanes ---
+vbroadcastsd  %xmm0, %ymm0           ; voxel.z → 4 lanes
+
+; --- x²+y²+z² for 4 particles simultaneously ---
+vsubpd   Pz_simd(%rbp), %ymm0, %ymm0 ; vox.z - Pz[0..3]
+vfmadd213pd  xy2_simd(%rbp), %ymm0, %ymm0  ; z² + (x²+y²)   [AVX2 FMA]
+
+; --- all-outside early exit (zero extra cost) ---
+vcmplepd  %ymm0, rmax2_ymm, %ymm1    ; rmax² <= dist² lane-wise
+vmovmskpd %ymm1, %eax                ; fold 4 comparison bits → scalar int
+not %eax
+test $0xf, %al
+je   next_voxel                      ; all 4 outside → skip with one branch
+
+; --- 4 square roots in one instruction ---
+vsqrtpd  %ymm0, %ymm1                ; sqrt(x²+y²+z²) for all 4 particles
+
+; --- dist = (sqrt(dist²) - r) × vdx ---
+vsubpd   r_simd(%rbp), %ymm1, %ymm1
+vmulpd   vdx_simd(%rbp), %ymm1, %ymm1
+
+; --- hmin: tree reduction 4→2→1 ---
+vmovapd       %xmm1, %xmm0           ; lower 2 lanes
+vextractf128  $1, %ymm1, %xmm1       ; upper 2 lanes
+; (stdx::reduce helper — see note below)
+vmovsd        %xmm0, %xmm0, %xmm1
+vunpckhpd     %xmm0, %xmm0, %xmm0
+vminsd        %xmm1, %xmm0, %xmm1   ; scalar minimum
+
+; --- scalar write boundary ---
+vcvtsd2ss  %xmm1, %xmm1, %xmm1      ; double min → float  (explicit operator T())
+vcomiss    %xmm1, grid_val           ; compare with existing voxel value
+vmovss     %xmm1, grid_val           ; conditional store
+```
+
+---
+
+## Key observations
+
+| | Observation |
+|---|---|
+| ✓ | All arithmetic uses 256-bit YMM registers (`vsubpd`, `vmulpd`, `vaddpd`, `vcmplepd`) |
+| ✓ | `vsqrtpd` — 4 double square roots in **one** instruction vs. 4 serial `sqrtsd` in scalar |
+| ✓ | `vcmplepd + vmovmskpd` — 4-lane comparison folds to a single branch (all-outside check) |
+| ✓ | `vbroadcastsd` — voxel coordinates broadcast across all 4 particle lanes for free |
+| ✓ | `vfmadd213pd` — fused multiply-add for z²+(x²+y²), no extra instruction |
+| ✓ | `vcvtsd2ss` at write boundary — clean extraction of scalar min (matches `explicit operator T()`) |
+| ⚠ | Two `vzeroupper + call` in the inner loop — AVX→SSE ABI transition before stdx helpers |
+
+---
+
+## The `vzeroupper` concern
+
+There are two `vzeroupper + call` sequences per voxel iteration:
+
+1. Before `stdx::any_of` — used for `hany(x2y2z2 <= min2)` (interior-fill check).
+2. Before `stdx::reduce` — used for the `hmin` 4→2 tree step.
+
+These arise because GCC's `std::experimental::fixed_size_simd<double,4>` calls
+internal helpers for reductions rather than emitting everything inline.
+On Arrow Lake (`vzeroupper` ≈ 2 cycles), this is minor.
+On older microarchitectures (Skylake, ~100 cycle penalty), it matters more.
+
+**Mitigation if needed:** replace `stdx::reduce` with explicit `vextractf128 +
+vminpd + vpermilpd + vminsd` in `hmin`, bypassing the helper entirely.  This
+is mechanical and can be added to Backend A without changing the public API.
+
+---
+
+## Scalar comparison (`-march=native` vs `-mavx`)
+
+Recompiling with `-march=native` (enables AVX2 + FMA) yields:
+- `vbroadcastsd` replaces `vmovddup + vinsertf128` (cleaner 1-instruction broadcast).
+- `vfmadd213pd` replaces `vmulpd + vaddpd` (fused multiply-add, saves an instruction).
+- The `vzeroupper + call` pattern is **unchanged** — it is a stdx implementation
+  detail, not an ISA limitation.
+
+Both builds produce correct results; all 8 `TestPointRasterizeSDF` test cases pass.
diff --git a/openvdb/openvdb/simd/Simd.h b/openvdb/openvdb/simd/Simd.h
new file mode 100644
index 0000000000..d02b778bee
--- /dev/null
+++ b/openvdb/openvdb/simd/Simd.h
@@ -0,0 +1,421 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: Apache-2.0
+///
+/// @file Simd.h
+///
+/// @brief Generic-T SIMD abstraction for OpenVDB.
+///
+/// @details  Provides Simd<T,W> and SimdMask<T,W> wrapper types together with
+///   free functions (where, hmin, hmax, hall, hany, hfirst, sqrt, square) that
+///   have matching scalar overloads, enabling kernels to be written once as
+///   templates on a value type T and compiled for both:
+///
+///     T = float / double  — scalar path, used directly or on GPU
+///     T = Simd<float,W>   — W-wide SIMD path for CPU batch processing
+///
+///   The same source — operators, where(), hmin() etc. — compiles correctly
+///   for both instantiations with no #ifdef or duplicated logic.
+///
+/// @note  Two backends are provided, selected automatically:
+///
+///   Backend A (OPENVDB_USE_STD_SIMD): wraps std::experimental::simd
+///     (Parallelism TS v2) in a thin class to provide the full API including
+///     explicit operator T() and value_type.  The compiler emits native SIMD
+///     instructions directly; no auto-vectorizer involvement.
+///
+///   Backend B (default, C++17): wraps std::array<T,W> with element-wise
+///     operator loops.  The compiler auto-vectorizes fixed-count loops.
+///     Annotated __hostdev__ for CUDA compatibility (NanoVDB use).
+///
+/// @note  Migration to std::simd (C++26) will be a one-line change in the
+///   backend detection guard; all call sites are unchanged.
+
+#ifndef OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED
+#define OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <climits>
+#include <cmath>
+#include <limits>
+#include <type_traits>
+
+#ifdef OPENVDB_USE_STD_SIMD
+#include <experimental/simd>
+#endif
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace simd {
+
+// ============================================================================
+// Register-width constants — derived from OPENVDB_SIMD compile flags.
+// SimdNativeT<T> uses this to select the natural lane count for element type T.
+// ============================================================================
+
+static constexpr size_t RegisterBits =
+#if defined(OPENVDB_USE_AVX)
+    256;
+#elif defined(OPENVDB_USE_SSE42)
+    128;
+#else
+    0; // no explicit ISA targeting; fall back to scalar (width 1)
+#endif
+
+// ============================================================================
+// Backend A: std::experimental::simd — thin wrapper class
+// ============================================================================
+#ifdef OPENVDB_USE_STD_SIMD
+
+namespace stdx = std::experimental;
+
+/// @brief  SIMD mask wrapper for Backend A.
+template<typename T, int W>
+struct SimdMask
+{
+    using MaskT = stdx::fixed_size_simd_mask<T, W>;
+    MaskT m;
+
+    SimdMask() = default;
+    /*implicit*/ SimdMask(MaskT mask) : m(mask) {}
+
+    bool operator[](int i) const { return m[i]; }
+};
+
+/// @brief  SIMD value wrapper for Backend A.
+template<typename T, int W>
+struct Simd
+{
+    using value_type = T;
+    using VecT = stdx::fixed_size_simd<T, W>;
+    VecT v;
+
+    Simd() = default;
+    /*implicit*/ Simd(T scalar) : v(scalar) {}   // broadcast
+    explicit     Simd(const T* p) : v(p, stdx::element_aligned) {} // load
+    /*implicit*/ Simd(VecT vec) : v(vec) {}
+
+    /// @brief  Extract lane 0.  When all lanes hold the same value (e.g.
+    ///   after hmin/hmax), this converts a SIMD result back to a scalar.
+    explicit operator T() const { return v[0]; }
+
+    T operator[](int i) const { return v[i]; }
+
+    void store(T* p) const { v.copy_to(p, stdx::element_aligned); }
+
+    // Arithmetic operators
+    Simd operator-() const { return Simd(-v); }
+    Simd operator+(Simd o) const { return Simd(v + o.v); }
+    Simd operator-(Simd o) const { return Simd(v - o.v); }
+    Simd operator*(Simd o) const { return Simd(v * o.v); }
+    Simd operator/(Simd o) const { return Simd(v / o.v); }
+
+    // Compound assignment
+    Simd& operator+=(Simd o) { v += o.v; return *this; }
+    Simd& operator-=(Simd o) { v -= o.v; return *this; }
+    Simd& operator*=(Simd o) { v *= o.v; return *this; }
+    Simd& operator/=(Simd o) { v /= o.v; return *this; }
+
+    // Comparison — return mask
+    SimdMask<T,W> operator>=(Simd o) const { return SimdMask<T,W>(v >= o.v); }
+    SimdMask<T,W> operator<=(Simd o) const { return SimdMask<T,W>(v <= o.v); }
+    SimdMask<T,W> operator> (Simd o) const { return SimdMask<T,W>(v >  o.v); }
+    SimdMask<T,W> operator< (Simd o) const { return SimdMask<T,W>(v <  o.v); }
+    SimdMask<T,W> operator==(Simd o) const { return SimdMask<T,W>(v == o.v); }
+    SimdMask<T,W> operator!=(Simd o) const { return SimdMask<T,W>(v != o.v); }
+};
+
+// Mixed scalar-SIMD operators
+template<typename T, int W> Simd<T,W> operator+(T a, Simd<T,W> b) { return Simd<T,W>(a) + b; }
+template<typename T, int W> Simd<T,W> operator+(Simd<T,W> a, T b) { return a + Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator-(T a, Simd<T,W> b) { return Simd<T,W>(a) - b; }
+template<typename T, int W> Simd<T,W> operator-(Simd<T,W> a, T b) { return a - Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator*(T a, Simd<T,W> b) { return Simd<T,W>(a) * b; }
+template<typename T, int W> Simd<T,W> operator*(Simd<T,W> a, T b) { return a * Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator/(T a, Simd<T,W> b) { return Simd<T,W>(a) / b; }
+template<typename T, int W> Simd<T,W> operator/(Simd<T,W> a, T b) { return a / Simd<T,W>(b); }
+
+// SIMD free functions — Backend A
+
+template<typename T, int W>
+Simd<T,W> where(SimdMask<T,W> mask, Simd<T,W> a, Simd<T,W> b) {
+    auto result = b.v;
+    stdx::where(mask.m, result) = a.v;
+    return Simd<T,W>(result);
+}
+
+template<typename T, int W>
+Simd<T,W> min(Simd<T,W> a, Simd<T,W> b) { return Simd<T,W>(stdx::min(a.v, b.v)); }
+
+template<typename T, int W>
+Simd<T,W> max(Simd<T,W> a, Simd<T,W> b) { return Simd<T,W>(stdx::max(a.v, b.v)); }
+
+template<typename T, int W>
+Simd<T,W> sqrt(Simd<T,W> v) { return Simd<T,W>(stdx::sqrt(v.v)); }
+
+template<typename T, int W>
+Simd<T,W> abs(Simd<T,W> v) { return Simd<T,W>(stdx::abs(v.v)); }
+
+/// @brief  Horizontal min — reduces all lanes to the minimum value and
+///   broadcasts it back to all lanes.  The result stays in Simd<T,W> so
+///   subsequent arithmetic (e.g. equality comparison to find winner lane)
+///   remains in the same type domain.  Use explicit operator T() to extract
+///   the scalar at the write boundary: T result = T(hmin(v));
+template<typename T, int W>
+Simd<T,W> hmin(Simd<T,W> v) {
+    // stdx::reduce passes intermediate simd chunks to the binary op (tree reduction),
+    // so the lambda must use auto parameters and stdx::min for element-wise selection.
+    T m = stdx::reduce(v.v, [](auto a, auto b){ return stdx::min(a, b); });
+    return Simd<T,W>(m);
+}
+
+template<typename T, int W>
+Simd<T,W> hmax(Simd<T,W> v) {
+    T m = stdx::reduce(v.v, [](auto a, auto b){ return stdx::max(a, b); });
+    return Simd<T,W>(m);
+}
+
+/// @brief  Returns true if all lanes of the mask are set.
+template<typename T, int W>
+bool hall(SimdMask<T,W> mask) { return stdx::all_of(mask.m); }
+
+/// @brief  Returns true if any lane of the mask is set.
+template<typename T, int W>
+bool hany(SimdMask<T,W> mask) { return stdx::any_of(mask.m); }
+
+/// @brief  Returns the index of the first set lane, or -1 if none.
+template<typename T, int W>
+int hfirst(SimdMask<T,W> mask) {
+    for (int i = 0; i < W; ++i) if (mask[i]) return i;
+    return -1;
+}
+
+// ============================================================================
+// Backend B: std::array fallback — auto-vectorizer target
+// ============================================================================
+#else // !OPENVDB_USE_STD_SIMD
+
+/// @brief  SIMD mask wrapper for Backend B.
+template<typename T, int W>
+struct SimdMask
+{
+    std::array<bool, W> m{};
+
+    SimdMask() = default;
+
+    bool operator[](int i) const { return m[i]; }
+    bool& operator[](int i) { return m[i]; }
+};
+
+/// @brief  SIMD value wrapper for Backend B.  Element-wise operator loops are
+///   the auto-vectorizer target; fixed-count, no struct indirection.
+template<typename T, int W>
+struct Simd
+{
+    using value_type = T;
+    std::array<T, W> v{};
+
+    Simd() = default;
+    /*implicit*/ Simd(T scalar) { v.fill(scalar); }        // broadcast
+    explicit     Simd(const T* p) {                         // load
+        for (int i = 0; i < W; ++i) v[i] = p[i];
+    }
+
+    /// @brief  Extract lane 0.  See hmin() for the intended usage pattern.
+    explicit operator T() const { return v[0]; }
+
+    T  operator[](int i) const { return v[i]; }
+    T& operator[](int i)       { return v[i]; }
+
+    void store(T* p) const {
+        for (int i = 0; i < W; ++i) p[i] = v[i];
+    }
+
+    Simd operator-() const {
+        Simd r; for (int i = 0; i < W; ++i) r[i] = -v[i]; return r;
+    }
+    Simd operator+(Simd o) const {
+        Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] + o[i]; return r;
+    }
+    Simd operator-(Simd o) const {
+        Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] - o[i]; return r;
+    }
+    Simd operator*(Simd o) const {
+        Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] * o[i]; return r;
+    }
+    Simd operator/(Simd o) const {
+        Simd r; for (int i = 0; i < W; ++i) r[i] = v[i] / o[i]; return r;
+    }
+
+    Simd& operator+=(Simd o) { for (int i = 0; i < W; ++i) v[i] += o[i]; return *this; }
+    Simd& operator-=(Simd o) { for (int i = 0; i < W; ++i) v[i] -= o[i]; return *this; }
+    Simd& operator*=(Simd o) { for (int i = 0; i < W; ++i) v[i] *= o[i]; return *this; }
+    Simd& operator/=(Simd o) { for (int i = 0; i < W; ++i) v[i] /= o[i]; return *this; }
+
+    SimdMask<T,W> operator>=(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] >= o[i]; return r;
+    }
+    SimdMask<T,W> operator<=(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] <= o[i]; return r;
+    }
+    SimdMask<T,W> operator>(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] > o[i]; return r;
+    }
+    SimdMask<T,W> operator<(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] < o[i]; return r;
+    }
+    SimdMask<T,W> operator==(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] == o[i]; return r;
+    }
+    SimdMask<T,W> operator!=(Simd o) const {
+        SimdMask<T,W> r; for (int i = 0; i < W; ++i) r[i] = v[i] != o[i]; return r;
+    }
+};
+
+// Mixed scalar-SIMD operators
+template<typename T, int W> Simd<T,W> operator+(T a, Simd<T,W> b) { return Simd<T,W>(a) + b; }
+template<typename T, int W> Simd<T,W> operator+(Simd<T,W> a, T b) { return a + Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator-(T a, Simd<T,W> b) { return Simd<T,W>(a) - b; }
+template<typename T, int W> Simd<T,W> operator-(Simd<T,W> a, T b) { return a - Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator*(T a, Simd<T,W> b) { return Simd<T,W>(a) * b; }
+template<typename T, int W> Simd<T,W> operator*(Simd<T,W> a, T b) { return a * Simd<T,W>(b); }
+template<typename T, int W> Simd<T,W> operator/(T a, Simd<T,W> b) { return Simd<T,W>(a) / b; }
+template<typename T, int W> Simd<T,W> operator/(Simd<T,W> a, T b) { return a / Simd<T,W>(b); }
+
+// SIMD free functions — Backend B
+
+template<typename T, int W>
+Simd<T,W> where(SimdMask<T,W> mask, Simd<T,W> a, Simd<T,W> b) {
+    Simd<T,W> r; for (int i = 0; i < W; ++i) r[i] = mask[i] ? a[i] : b[i]; return r;
+}
+
+template<typename T, int W>
+Simd<T,W> min(Simd<T,W> a, Simd<T,W> b) {
+    Simd<T,W> r; for (int i = 0; i < W; ++i) r[i] = a[i] < b[i] ? a[i] : b[i]; return r;
+}
+
+template<typename T, int W>
+Simd<T,W> max(Simd<T,W> a, Simd<T,W> b) {
+    Simd<T,W> r; for (int i = 0; i < W; ++i) r[i] = a[i] > b[i] ? a[i] : b[i]; return r;
+}
+
+template<typename T, int W>
+Simd<T,W> sqrt(Simd<T,W> v) {
+    Simd<T,W> r; for (int i = 0; i < W; ++i) r[i] = std::sqrt(v[i]); return r;
+}
+
+template<typename T, int W>
+Simd<T,W> abs(Simd<T,W> v) {
+    Simd<T,W> r; for (int i = 0; i < W; ++i) r[i] = std::abs(v[i]); return r;
+}
+
+template<typename T, int W>
+Simd<T,W> hmin(Simd<T,W> v) {
+    T m = v[0]; for (int i = 1; i < W; ++i) m = m < v[i] ? m : v[i];
+    return Simd<T,W>(m);
+}
+
+template<typename T, int W>
+Simd<T,W> hmax(Simd<T,W> v) {
+    T m = v[0]; for (int i = 1; i < W; ++i) m = m > v[i] ? m : v[i];
+    return Simd<T,W>(m);
+}
+
+template<typename T, int W>
+bool hall(SimdMask<T,W> mask) {
+    for (int i = 0; i < W; ++i) if (!mask[i]) return false;
+    return true;
+}
+
+template<typename T, int W>
+bool hany(SimdMask<T,W> mask) {
+    for (int i = 0; i < W; ++i) if (mask[i]) return true;
+    return false;
+}
+
+template<typename T, int W>
+int hfirst(SimdMask<T,W> mask) {
+    for (int i = 0; i < W; ++i) if (mask[i]) return i;
+    return -1;
+}
+
+#endif // OPENVDB_USE_STD_SIMD
+
+// ============================================================================
+// Scalar overloads — always present.
+//
+// These make the Generic-T pattern complete: a kernel templated on T compiles
+// identically for T=float (scalar/GPU) and T=Simd<float,W> (CPU batch).
+// The scalar overloads are the identity in every case.
+// ============================================================================
+
+template<typename T> T    where(bool m, T a, T b) { return m ? a : b; }
+template<typename T> T    min(T a, T b)            { return a < b ? a : b; }
+template<typename T> T    max(T a, T b)            { return a > b ? a : b; }
+template<typename T> T    sqrt(T v)                { return std::sqrt(v); }
+template<typename T> T    abs(T v)                 { return std::abs(v); }
+template<typename T> T    hmin(T v)                { return v; } // identity: one lane
+template<typename T> T    hmax(T v)                { return v; } // identity: one lane
+inline bool               hall(bool b)             { return b; } // identity: one lane
+inline bool               hany(bool b)             { return b; } // identity: one lane
+inline int                hfirst(bool b)           { return b ? 0 : -1; }
+
+// square — useful shorthand, defined after scalar overloads so T works for both
+template<typename T> T square(T v) { return v * v; }
+
+// ============================================================================
+// Scalar<T> — extract the underlying element type from T.
+//
+//   scalar_t<float>          = float   (primary: T has no value_type)
+//   scalar_t<Simd<float,W>>  = float   (specialization: T::value_type exists)
+//
+// Detection uses std::void_t so it generalises to any type with a value_type
+// member, without an explicit Simd<T,W> specialization.
+// ============================================================================
+
+template<typename T, typename = void>
+struct Scalar { using type = T; };
+
+template<typename T>
+struct Scalar<T, std::void_t<typename T::value_type>> {
+    using type = typename T::value_type;
+};
+
+template<typename T>
+using scalar_t = typename Scalar<T>::type;
+
+// ============================================================================
+// SimdTraits<SimdT> — compile-time lane count for a SIMD type.
+//
+//   SimdTraits<float>::size           = 1
+//   SimdTraits<Simd<float,W>>::size   = W
+// ============================================================================
+
+template<typename T>
+struct SimdTraits { static constexpr size_t size = 1; };
+
+template<typename T, int W>
+struct SimdTraits<Simd<T,W>> { static constexpr size_t size = size_t(W); };
+
+// ============================================================================
+// SimdNativeT<T> — select the natural SIMD type for element type T.
+//
+//   SimdNativeT<double>::Type on AVX  → Simd<double, 4>  (256 / 64 bits)
+//   SimdNativeT<float>::Type  on AVX  → Simd<float,  8>  (256 / 32 bits)
+//   SimdNativeT<double>::Type no ISA  → Simd<double, 1>  (scalar fallback)
+// ============================================================================
+
+template<typename T>
+struct SimdNativeT
+{
+    static constexpr size_t elemBits = sizeof(T) * CHAR_BIT;
+    static constexpr size_t width    = (RegisterBits > 0)
+        ? (RegisterBits / elemBits) : 1;
+    using Type = Simd<T, int(width)>;
+};
+
+} // namespace simd
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_SIMD_SIMD_HAS_BEEN_INCLUDED