diff --git a/docs/documentation/case.md b/docs/documentation/case.md
index 5acd85a7a3..dbd127f23c 100644
--- a/docs/documentation/case.md
+++ b/docs/documentation/case.md
@@ -638,7 +638,7 @@ To restart the simulation from $k$-th time step, see @ref running "Restarting Ca
 | `alpha_wrt(i)` | Logical | Add the volume fraction of fluid $i$ to the database |
 | `gamma_wrt` | Logical | Add the specific heat ratio function to the database |
 | `heat_ratio_wrt` | Logical | Add the specific heat ratio to the database |
-| `ib_state_wrt` | Logical | Write IB state and loads to a datafile at each time step |
+| `ib_state_wrt` | Logical | Write the IB state at each save and output it as a point mesh to the SILO files |
 | `pi_inf_wrt` | Logical | Add the liquid stiffness function to the database |
 | `pres_inf_wrt` | Logical | Add the liquid stiffness to the formatted database |
 | `c_wrt` | Logical | Add the sound speed to the database |
@@ -706,7 +706,7 @@ If `file_per_process` is true, then pre_process, simulation, and post_process mu
 - `probe_wrt` activates the output of state variables at coordinates specified by `probe(i)%[x;y,z]`.
-- `ib_state_wrt` activates the output of data specified by patch_ib(i)%force(:) (and torque, vel, angular_vel, angles, [x,y,z]_centroid) into a single binary datafile for all IBs at all timesteps. During post_processing, this file is converted into separate time histories for each IB.
+- `ib_state_wrt` activates writing of the IB state at each save; post-processing converts it into a point mesh in the SILO files. When no IBs are moving, it also activates the force and torque calculation so that those values can be written to the output state files.
 - `output_partial_domain` activates the output of part of the domain specified by `[x,y,z]_output%%beg` and `[x,y,z]_output%%end`. This is useful for large domains where only a portion of the domain is of interest.
diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp
index cacc50f528..d66fb10614 100644
--- a/src/common/m_constants.fpp
+++ b/src/common/m_constants.fpp
@@ -25,6 +25,7 @@ module m_constants
     integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation
     integer, parameter :: num_patches_max = 10 !< Maximum number of IC patches
     integer, parameter :: num_ib_patches_max = 50000 !< Maximum number of immersed boundary patches (patch_ib)
+    integer, parameter :: num_local_ibs_max = 2000 !< Maximum number of immersed boundary patches local to a single rank
     integer, parameter :: num_bc_patches_max = 10 !< Maximum number of boundary condition patches
     integer, parameter :: max_2d_fourier_modes = 10 !< Max Fourier mode index for 2D modal patch (geometry 13)
     integer, parameter :: max_sph_harm_degree = 5 !< Max degree L for 3D spherical harmonic patch (geometry 14)
diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index 6ee2e836a5..c4e5873ead 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -202,12 +202,10 @@ module m_derived_types type :: t_model_array ! 
Original CPU-side fields (unchanged) - type(t_model), allocatable :: model !< STL/OBJ geometry model - real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices - real(wp), allocatable, dimension(:,:) :: interpolated_boundary_v !< Interpolated boundary vertices - integer :: boundary_edge_count !< Number of boundary edges - integer :: total_vertices !< Total vertex count - integer :: interpolate !< Interpolation flag + type(t_model), allocatable :: model !< STL/OBJ geometry model + real(wp), allocatable, dimension(:,:,:) :: boundary_v !< Boundary vertices + integer :: boundary_edge_count !< Number of boundary edges + integer :: total_vertices !< Total vertex count ! GPU-friendly flattened arrays integer :: ntrs !< Copy of model%ntrs @@ -272,9 +270,10 @@ module m_derived_types end type ic_patch_parameters type ib_patch_parameters - integer :: geometry !< Type of geometry for the patch + integer :: gbl_patch_id real(wp) :: x_centroid, y_centroid, z_centroid !< Geometric center coordinates of the patch + !> Centroid locations of intermediate steps in the time_stepper module real(wp) :: step_x_centroid, step_y_centroid, step_z_centroid real(wp), dimension(1:3) :: centroid_offset !< offset of center of mass from computed cell center for odd-shaped IBs diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp index 0dab036f9b..f02474f8d0 100644 --- a/src/common/m_model.fpp +++ b/src/common/m_model.fpp @@ -983,10 +983,11 @@ contains dx_local = minval(dx); dy_local = minval(dy) if (p /= 0) dz_local = minval(dz) - allocate (stl_bounding_boxes(num_ibs,1:3,1:3)) + allocate (stl_bounding_boxes(num_gbl_ibs,1:3,1:3)) do patch_id = 1, num_ibs if (patch_ib(patch_id)%geometry == 5 .or. patch_ib(patch_id)%geometry == 12) then + print *, proc_rank, patch_id, num_ibs, patch_ib(patch_id)%geometry allocate (models(patch_id)%model) print *, " * Reading model: " // trim(patch_ib(patch_id)%model_filepath) diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index a5fdb3ef81..effc020325 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -909,7 +909,7 @@ contains integer :: ifile, ierr integer, dimension(MPI_STATUS_SIZE) :: status logical :: file_exist - integer :: i + integer :: i, ib_idx integer, parameter :: NFIELDS_PER_IB = 20 real(wp) :: ib_buf(NFIELDS_PER_IB) @@ -923,19 +923,6 @@ contains end if call s_mpi_barrier() - ! Divide num_ibs across num_procs - nibs_per_rank = num_ibs/num_procs - remainder = mod(num_ibs, num_procs) - - ! Ranks < remainder get one extra IB - if (proc_rank < remainder) then - ib_start = proc_rank*(nibs_per_rank + 1) + 1 - ib_end = ib_start + nibs_per_rank ! 
nibs_per_rank + 1 total - else - ib_start = remainder*(nibs_per_rank + 1) + (proc_rank - remainder)*nibs_per_rank + 1 - ib_end = ib_start + nibs_per_rank - 1 - end if - write (file_loc, '(A,I0,A)') '/restart_data/ib_state_', t_step, '.dat' file_loc = trim(case_dir) // trim(file_loc) @@ -947,20 +934,21 @@ contains call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), mpi_info_int, ifile, ierr) - do i = ib_start, ib_end + do i = 1, num_local_ibs + ib_idx = local_ib_patch_ids(i) ib_buf(1) = mytime - ib_buf(2:4) = patch_ib(i)%force(1:3) - ib_buf(5:7) = patch_ib(i)%torque(1:3) - ib_buf(8:10) = patch_ib(i)%vel(1:3) - ib_buf(11:13) = patch_ib(i)%angular_vel(1:3) - ib_buf(14:16) = patch_ib(i)%angles(1:3) - ib_buf(17) = patch_ib(i)%x_centroid - ib_buf(18) = patch_ib(i)%y_centroid - ib_buf(19) = patch_ib(i)%z_centroid - ib_buf(20) = patch_ib(i)%radius + ib_buf(2:4) = patch_ib(ib_idx)%force(1:3) + ib_buf(5:7) = patch_ib(ib_idx)%torque(1:3) + ib_buf(8:10) = patch_ib(ib_idx)%vel(1:3) + ib_buf(11:13) = patch_ib(ib_idx)%angular_vel(1:3) + ib_buf(14:16) = patch_ib(ib_idx)%angles(1:3) + ib_buf(17) = patch_ib(ib_idx)%x_centroid + ib_buf(18) = patch_ib(ib_idx)%y_centroid + ib_buf(19) = patch_ib(ib_idx)%z_centroid + ib_buf(20) = patch_ib(ib_idx)%radius ! Global IB index (i) determines position in file - disp = int(i - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK + disp = int(patch_ib(ib_idx)%gbl_patch_id - 1, MPI_OFFSET_KIND)*int(NFIELDS_PER_IB, MPI_OFFSET_KIND)*WP_MOK call MPI_FILE_WRITE_AT(ifile, disp, ib_buf, NFIELDS_PER_IB, mpi_p, status, ierr) end do diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 7fb3353e2c..f1f2317236 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -229,7 +229,8 @@ module m_global_parameters $:GPU_DECLARE(create='[ib_bc_x, ib_bc_y, ib_bc_z]') #endif type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') + type(bounds_info) :: neighbor_domain_x, neighbor_domain_y, neighbor_domain_z + $:GPU_DECLARE(create='[x_domain, y_domain, z_domain, neighbor_domain_x, neighbor_domain_y, neighbor_domain_z]') real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b logical :: parallel_io !< Format of the data files @@ -336,18 +337,21 @@ module m_global_parameters !> @name Immersed Boundaries !> @{ - logical :: ib - integer :: num_ibs - integer :: collision_model - real(wp) :: coefficient_of_restitution - real(wp) :: collision_time - real(wp) :: ib_coefficient_of_friction - logical :: ib_state_wrt - type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib !< Immersed boundary patch parameters - type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l - integer :: Np - - $:GPU_DECLARE(create='[ib, num_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l]') + logical :: ib + integer :: num_ibs !< number of IBs that the current processor is aware of + integer :: num_gbl_ibs !< number of IBs in the overall simulation + integer :: num_local_ibs !< number of IBs that lie inside the processor domain + integer :: collision_model + real(wp) :: coefficient_of_restitution + real(wp) :: collision_time + real(wp) :: ib_coefficient_of_friction + logical :: ib_state_wrt + type(ib_patch_parameters), allocatable, dimension(:) :: patch_ib !< Immersed boundary patch parameters + integer, dimension(num_local_ibs_max) :: local_ib_patch_ids !< lookup table of IBs in the local compute domain + 
type(vec3_dt), allocatable, dimension(:) :: airfoil_grid_u, airfoil_grid_l + integer :: Np + + $:GPU_DECLARE(create='[ib, num_ibs, num_gbl_ibs, num_local_ibs, patch_ib, Np, airfoil_grid_u, airfoil_grid_l, local_ib_patch_ids]') $:GPU_DECLARE(create='[ib_coefficient_of_friction]') !> @} @@ -787,7 +791,9 @@ contains relativity = .false. #:endif + allocate (patch_ib(num_ib_patches_max)) do i = 1, num_ib_patches_max + patch_ib(i)%gbl_patch_id = i patch_ib(i)%geometry = dflt_int patch_ib(i)%x_centroid = 0._wp patch_ib(i)%y_centroid = 0._wp diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index cf3ad9ecfa..1770e9c271 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -109,7 +109,7 @@ contains radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -221,7 +221,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -376,7 +376,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -466,7 +466,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -510,16 +510,12 @@ contains real(wp) :: radius real(wp), dimension(1:3) :: center - ! Variables to initialize the pressure field that corresponds to the bubble-collapse test case found in Tiwari et al. (2013) - - ! Transferring spherical patch's radius, centroid, smoothing patch identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) radius = patch_ib(patch_id)%radius - ! completely skip particles no in the domain + ! completely skip particles not in the domain if (center(1) - radius > x_cc(m + gp_layers + 1) .or. center(1) + radius < x_cc(-gp_layers - 1) .or. center(2) & & - radius > y_cc(n + gp_layers + 1) .or. center(2) + radius < y_cc(-gp_layers - 1) .or. center(3) - radius > z_cc(p & & + gp_layers + 1) .or. center(3) + radius < z_cc(-gp_layers - 1)) then @@ -527,7 +523,7 @@ contains end if ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! 
find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -542,18 +538,12 @@ contains ! Checking whether the sphere covers a particular cell in the domain and verifying whether the current patch has permission ! to write to that cell. If both queries check out, the primitive variables of the current patch are assigned to this cell. - $:GPU_PARALLEL_LOOP(private='[i, j, k, cart_y, cart_z]', copyin='[encoded_patch_id, center, radius]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i, j, k]', copyin='[encoded_patch_id, center, radius]', collapse=3) do k = kl, kr do j = jl, jr do i = il, ir - if (grid_geometry == 3) then - call s_convert_cylindrical_to_cartesian_coord(y_cc(j), z_cc(k)) - else - cart_y = y_cc(j) - cart_z = z_cc(k) - end if ! Updating the patch identities bookkeeping variable - if (((x_cc(i) - center(1))**2 + (cart_y - center(2))**2 + (cart_z - center(3))**2 <= radius**2)) then + if (((x_cc(i) - center(1))**2 + (y_cc(j) - center(2))**2 + (z_cc(k) - center(3))**2 <= radius**2)) then ib_markers%sf(i, j, k) = encoded_patch_id end if end do @@ -586,7 +576,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -656,7 +646,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -724,7 +714,7 @@ contains inverse_rotation(:,:) = patch_ib(patch_id)%rotation_matrix_inverse(:,:) ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) ! find the indices to the left and right of the IB in i, j, k il = -gp_layers - 1 @@ -781,7 +771,7 @@ contains threshold = patch_ib(patch_id)%model_threshold ! encode the periodicity information into the patch_id - call s_encode_patch_periodicity(patch_id, xp, yp, 0, encoded_patch_id) + call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, 0, encoded_patch_id) il = -gp_layers - 1 jl = -gp_layers - 1 @@ -858,7 +848,7 @@ contains rotation(:,:) = patch_ib(patch_id)%rotation_matrix(:,:) ! 
encode the periodicity information into the patch_id
-        call s_encode_patch_periodicity(patch_id, xp, yp, zp, encoded_patch_id)
+        call s_encode_patch_periodicity(patch_ib(patch_id)%gbl_patch_id, xp, yp, zp, encoded_patch_id)
         il = -gp_layers - 1
         jl = -gp_layers - 1
@@ -1039,7 +1029,7 @@ contains
         temp_y_per = y_periodicity; if (y_periodicity == -1) temp_y_per = 2
         temp_z_per = z_periodicity; if (z_periodicity == -1) temp_z_per = 2
-        offset = (num_ibs + 1)*temp_x_per + 3*(num_ibs + 1)*temp_y_per + 9*(num_ibs + 1)*temp_z_per
+        offset = (num_gbl_ibs + 1)*temp_x_per + 3*(num_gbl_ibs + 1)*temp_y_per + 9*(num_gbl_ibs + 1)*temp_z_per
         encoded_patch_id = patch_id + offset
     end subroutine s_encode_patch_periodicity
@@ -1054,7 +1044,7 @@ contains
         integer, intent(out), optional :: x_periodicity, y_periodicity, z_periodicity
         integer :: offset, remainder, xp, yp, zp, base
-        base = num_ibs + 1
+        base = num_gbl_ibs + 1
         patch_id = mod(encoded_patch_id - 1, base) + 1
         offset = (encoded_patch_id - patch_id)/base
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 5f865cffeb..90431a6839 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -1010,9 +1010,8 @@ contains
         call s_apply_collision_forces(ghost_points, num_gps, ib_markers, forces, torques)
-        ! reduce the forces across all MPI ranks
-        call s_mpi_allreduce_vectors_sum(forces, forces, num_ibs, 3)
-        call s_mpi_allreduce_vectors_sum(torques, torques, num_ibs, 3)
+        ! reduce the forces across local neighborhood ranks
+        call s_communicate_ib_forces(forces, torques)
         ! consider body forces after reducing to avoid double counting
         do i = 1, num_ibs
@@ -1225,4 +1224,396 @@ contains
     end subroutine s_wrap_periodic_ibs
+    !> @brief Reduces the IB forces and torques across the local neighborhood of ranks without a global allreduce.
+    !> Accumulation phase: 2 passes per
+    !! dimension receiving from the low-index (-X) neighbor. Pass 1: add received values; save what was received as recv_snap. Pass
+    !! 2: send current (post-pass-1) values; add received; subtract recv_snap to remove double-counting of the direct contribution
+    !! already added in pass 1. Back-propagation phase: 2 passes per dimension receiving from the high-index (+X) neighbor, each
+    !! overwriting local forces with the neighbor's accumulated total.
+    subroutine s_communicate_ib_forces(forces, torques)
+
+        real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques
+
+#ifdef MFC_MPI
+        integer :: i, j, pack_pos, unpack_pos, buf_size, ierr
+        integer :: send_neighbor, recv_neighbor, recv_count, pid
+        real(wp), dimension(3) :: fval, tval
+        real(wp), allocatable :: recv_forces_snap(:,:), recv_torques_snap(:,:)
+        character(len=1), allocatable :: send_buf(:), recv_buf(:)
+
+        if (num_procs == 1) return
+
+        buf_size = storage_size(0)/8 + (storage_size(0)/8 + 6*storage_size(0._wp)/8)*size(patch_ib)
+        allocate (send_buf(buf_size), recv_buf(buf_size), recv_forces_snap(num_ibs, 3), recv_torques_snap(num_ibs, 3))
+
+        ! Accumulation phase: propagate contributions toward the high-index corner.
+        #:for X, ID, TAG1, TAG2 in [('x', 1, 300, 302), ('y', 2, 304, 306), ('z', 3, 308, 310)]
+            if (num_dims >= ${ID}$) then
+                send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0)
+                recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0)
+
+                ! Pass 1: send current forces to +${X}$ neighbor; receive from -${X}$ neighbor and add. Save what was received as
+                ! recv_snap for double-count removal in pass 2. 
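+                ! Packed message layout: num_ibs, then one (gbl_patch_id, force(1:3), torque(1:3)) record per IB this rank is aware of.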
+ recv_forces_snap = 0._wp + recv_torques_snap = 0._wp + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG1}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG1}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + recv_forces_snap(j,:) = fval(:) + recv_torques_snap(j,:) = tval(:) + forces(j,:) = forces(j,:) + fval(:) + torques(j,:) = torques(j,:) + tval(:) + exit + end if + end do + end do + end if + + ! Pass 2: send post-pass-1 forces to +${X}$ neighbor; receive from -${X}$ neighbor. Add received values then + ! subtract recv_snap to remove the pass-1 contribution that was already counted, leaving only the 2-hop delta. + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG2}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG2}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = forces(j,:) + fval(:) - recv_forces_snap(j,:) + torques(j,:) = torques(j,:) + tval(:) - recv_torques_snap(j,:) + exit + end if + end do + end do + end if + end if + #:endfor + + ! Back-propagation phase: for each dimension, 2 passes receiving from the high-index neighbor. Each pass overwrites local + ! forces with the neighbor's accumulated total. Two passes ensure the total reaches 2 hops back, covering the full + ! neighborhood. 
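+        ! These exchanges reuse the same packed (count, gbl_patch_id, force, torque) layout as the accumulation phase; tags 312-322
+        ! are distinct per direction and pass, and from the accumulation-phase tags 300-310, so the two phases cannot swap messages.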
+ #:for X, ID, TAG1, TAG2 in [('x', 1, 312, 314), ('y', 2, 316, 318), ('z', 3, 320, 322)] + if (num_dims >= ${ID}$) then + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + + #:for TAG in [TAG1, TAG2] + pack_pos = 0 + call MPI_PACK(num_ibs, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + do i = 1, num_ibs + call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + fval(:) = forces(i,:); tval(:) = torques(i,:) + call MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + end do + call MPI_SENDRECV(send_buf, pack_pos, MPI_PACKED, send_neighbor, ${TAG}$, recv_buf, buf_size, MPI_PACKED, & + & recv_neighbor, ${TAG}$, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + if (recv_neighbor /= MPI_PROC_NULL) then + unpack_pos = 0 + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_UNPACK(recv_buf, buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == pid) then + forces(j,:) = fval(:) + torques(j,:) = tval(:) + exit + end if + end do + end do + end if + #:endfor + end if + #:endfor + + deallocate (send_buf, recv_buf, recv_forces_snap, recv_torques_snap) +#endif + + end subroutine s_communicate_ib_forces + + !> Alternative force reduction using two non-blocking all-to-neighbor broadcasts. Phase 1: every rank sends its full force array + !! to all 26 neighborhood neighbors simultaneously. After MPI_WAITALL, each rank sums contributions from neighbors for its owned + !! particles. Phase 2: each rank sends its finalized owned-particle forces (by gbl_patch_id) back to all neighbors + !! simultaneously. After MPI_WAITALL, each rank overwrites ghost-particle forces with the authoritative values from the owning + !! rank. Not currently called - available for benchmarking against s_communicate_ib_forces. + ! subroutine s_communicate_ib_forces_scatter(forces, torques) + + ! real(wp), dimension(num_ibs, 3), intent(inout) :: forces, torques + + ! #ifdef MFC_MPI integer, parameter :: max_nbrs = 26 integer :: i, j, k, nbr_idx, nreqs, pack_pos, unpack_pos integer :: + ! buf_size, entry_bytes, ierr, recv_count, pid integer :: send_neighbor, recv_neighbor, dx, dy, dz, tag integer, dimension(3) :: + ! nbr_coords logical :: is_owned real(wp), dimension(3) :: fval, tval real(wp), dimension(num_ibs, 3) :: forces_total, + ! torques_total integer, dimension(max_nbrs) :: recv_neighbor_list integer, dimension(2*max_nbrs) :: requests character(len=1), + ! allocatable :: send_buf(:), recv_bufs(:,:) character(len=1), allocatable :: owned_send_buf(:), owned_recv_bufs(:,:) integer :: + ! owned_buf_size + + ! if (num_procs == 1) return + + ! ! Buffer sized to hold count + (gbl_patch_id, forces, torques) per particle entry_bytes = storage_size(0)/8 + + ! 6*storage_size(0._wp)/8 buf_size = storage_size(0)/8 + entry_bytes*num_ibs owned_buf_size = storage_size(0)/8 + + ! entry_bytes*num_local_ibs_max allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs), owned_send_buf(owned_buf_size), & & + ! owned_recv_bufs(owned_buf_size, max_nbrs)) + + ! ! 
Phase 1: pack full local force array and broadcast to all neighborhood neighbors. pack_pos = 0 call MPI_PACK(num_ibs, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) do i = 1, num_ibs call MPI_PACK(patch_ib(i)%gbl_patch_id, 1, + ! MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) fval(:) = forces(i,:); tval(:) = torques(i,:) call + ! MPI_PACK(fval, 3, mpi_p, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, send_buf, buf_size, + ! pack_pos, MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & & + ! requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! == 0) cycle tag = 400 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), ierr) + ! end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Local reduction: for each owned particle, sum contributions from all neighbors. forces_total = forces torques_total = + ! torques do nbr_idx = 1, merge(26, 8, num_dims == 3) if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 + ! call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) do i = 1, + ! recv_count call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only accumulate for particles + ! this rank owns do k = 1, num_local_ibs j = local_ib_patch_ids(k) if (patch_ib(j)%gbl_patch_id == pid) then forces_total(j,:) = + ! forces_total(j,:) + fval(:) torques_total(j,:) = torques_total(j,:) + tval(:) exit end if end do end do end do + + ! ! Write totals back for owned particles only do k = 1, num_local_ibs j = local_ib_patch_ids(k) forces(j,:) = forces_total(j,:) + ! torques(j,:) = torques_total(j,:) end do + + ! ! Phase 2: pack finalized owned-particle forces and back-broadcast to all neighbors. pack_pos = 0 call MPI_PACK(num_local_ibs, + ! 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) do k = 1, num_local_ibs j = + ! local_ib_patch_ids(k) call MPI_PACK(patch_ib(j)%gbl_patch_id, 1, MPI_INTEGER, owned_send_buf, owned_buf_size, pack_pos, + ! MPI_COMM_WORLD, ierr) fval(:) = forces(j,:); tval(:) = torques(j,:) call MPI_PACK(fval, 3, mpi_p, owned_send_buf, + ! owned_buf_size, pack_pos, MPI_COMM_WORLD, ierr) call MPI_PACK(tval, 3, mpi_p, owned_send_buf, owned_buf_size, pack_pos, + ! 
MPI_COMM_WORLD, ierr) end do + + ! nreqs = 0 nbr_idx = 0 do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 + ! .and. dy == 0 .and. dz == 0) cycle nbr_idx = nbr_idx + 1 tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords - [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL recv_neighbor_list(nbr_idx) = recv_neighbor + + ! nreqs = nreqs + 1 call MPI_IRECV(owned_recv_bufs(:,nbr_idx), owned_buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & + ! & requests(nreqs), ierr) end do end do end do + + ! do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) do dy = -1, 1 do dx = -1, 1 if (dx == 0 .and. dy == 0 .and. dz + ! == 0) cycle tag = 427 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! nbr_coords = proc_coords + [dx, dy, dz] call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) if (ierr /= + ! MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + ! nreqs = nreqs + 1 call MPI_ISEND(owned_send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), + ! ierr) end do end do end do + + ! call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! ! Overwrite ghost-particle forces with authoritative values from the owning rank. do nbr_idx = 1, merge(26, 8, num_dims == 3) + ! if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle unpack_pos = 0 call MPI_UNPACK(owned_recv_bufs(:,nbr_idx), + ! owned_buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, & & ierr) do i = 1, recv_count call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, pid, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, fval, 3, mpi_p, MPI_COMM_WORLD, ierr) call + ! MPI_UNPACK(owned_recv_bufs(:,nbr_idx), owned_buf_size, unpack_pos, tval, 3, mpi_p, MPI_COMM_WORLD, ierr) ! Only overwrite + ! ghost particles (not owned ones - this rank's total is authoritative) do j = 1, num_ibs if (patch_ib(j)%gbl_patch_id == pid) + ! then is_owned = .false. do k = 1, num_local_ibs if (local_ib_patch_ids(k) == j) then is_owned = .true. exit end if end do if + ! (.not. is_owned) then forces(j,:) = fval(:) torques(j,:) = tval(:) end if exit end if end do end do end do + + ! deallocate (send_buf, recv_bufs, owned_send_buf, owned_recv_bufs) #endif + + ! end subroutine s_communicate_ib_forces_scatter + + subroutine s_handoff_ib_ownership() + + integer :: i, j, k, output_idx, local_output_idx + integer :: old_num_local_ibs + integer :: new_count, recv_count + integer :: pack_pos, unpack_pos, buf_size, patch_bytes + integer :: send_neighbor, recv_neighbor, ierr + integer :: dx, dy, dz, tag, nbr_idx, nreqs + integer, dimension(3) :: nbr_coords + real(wp) :: position + real(wp), dimension(3) :: centroid + logical :: is_new, already_known + type(ib_patch_parameters) :: tmp_patch + integer, dimension(num_local_ibs_max) :: local_ib_idx_old + ! 26 neighbors max in 3D; each gets its own recv buffer and a request handle for send + recv + integer, parameter :: max_nbrs = 26 + character(len=1), allocatable :: send_buf(:), recv_bufs(:,:) + integer, dimension(2*max_nbrs) :: requests + integer, dimension(max_nbrs) :: recv_neighbor_list + + #:if defined('MFC_MPI') + if (num_procs > 1) then + ! save a copy of the local IB's global indices to cross-reference for later. 
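+                ! local_ib_idx_old is compared against the recomputed ownership list below to flag patches newly owned by this
+                ! rank; only those new patches are packed and broadcast to the neighborhood.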
+ old_num_local_ibs = num_local_ibs + do i = 1, num_local_ibs + local_ib_idx_old(i) = patch_ib(local_ib_patch_ids(i))%gbl_patch_id + end do + + ! delete any particles that no longer need to be tracked and coalesce the array + output_idx = 0 + local_output_idx = 0 + do i = 1, num_ibs + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (patch_ib(i)%${X}$_centroid < neighbor_domain_${X}$%beg .or. neighbor_domain_${X}$%end < patch_ib(i) & + & %${X}$_centroid) then + cycle + end if + #:endfor + + output_idx = output_idx + 1 + if (i /= output_idx) patch_ib(output_idx) = patch_ib(i) + centroid = [patch_ib(i)%x_centroid, patch_ib(i)%y_centroid, 0._wp] + if (num_dims == 3) centroid(3) = patch_ib(i)%z_centroid + if (f_local_rank_owns_collision(centroid)) then + local_output_idx = local_output_idx + 1 + local_ib_patch_ids(local_output_idx) = output_idx + end if + end do + num_ibs = output_idx + num_local_ibs = local_output_idx + + ! Broadcast newly-owned patches to all neighborhood neighbors (including corners/edges). + patch_bytes = storage_size(tmp_patch)/8 + buf_size = storage_size(0)/8 + patch_bytes*num_local_ibs_max + allocate (send_buf(buf_size), recv_bufs(buf_size, max_nbrs)) + + ! Write placeholder count at position 0 + pack_pos = 0 + call MPI_PACK(0, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + + ! Single pass: pack new patches and count them + new_count = 0 + do i = 1, num_local_ibs + k = local_ib_patch_ids(i) + is_new = .true. + do j = 1, old_num_local_ibs + if (patch_ib(k)%gbl_patch_id == local_ib_idx_old(j)) then + is_new = .false. + exit + end if + end do + if (is_new) then + call MPI_PACK(patch_ib(k), patch_bytes, MPI_BYTE, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + new_count = new_count + 1 + end if + end do + + ! Overwrite the placeholder with the real count + pack_pos = 0 + call MPI_PACK(new_count, 1, MPI_INTEGER, send_buf, buf_size, pack_pos, MPI_COMM_WORLD, ierr) + pack_pos = storage_size(0)/8 + new_count*patch_bytes + + ! Post all receives first, then all sends, so they are all in flight together. Tags 200..226: tag = 200 + (dx+1)*9 + + ! (dy+1)*3 + (dz+1) + nreqs = 0 + nbr_idx = 0 + do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) + do dy = -1, 1 + do dx = -1, 1 + if (dx == 0 .and. dy == 0 .and. dz == 0) cycle + nbr_idx = nbr_idx + 1 + tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + ! Receive from the mirror direction + nbr_coords = proc_coords - [dx, dy, dz] + call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, recv_neighbor, ierr) + if (ierr /= MPI_SUCCESS) recv_neighbor = MPI_PROC_NULL + recv_neighbor_list(nbr_idx) = recv_neighbor + + nreqs = nreqs + 1 + call MPI_IRECV(recv_bufs(:,nbr_idx), buf_size, MPI_PACKED, recv_neighbor, tag, MPI_COMM_WORLD, & + & requests(nreqs), ierr) + end do + end do + end do + + do dz = merge(-1, 0, num_dims == 3), merge(1, 0, num_dims == 3) + do dy = -1, 1 + do dx = -1, 1 + if (dx == 0 .and. dy == 0 .and. dz == 0) cycle + tag = 200 + (dx + 1)*9 + (dy + 1)*3 + (dz + 1) + + nbr_coords = proc_coords + [dx, dy, dz] + call MPI_CART_RANK(MPI_COMM_CART, nbr_coords, send_neighbor, ierr) + if (ierr /= MPI_SUCCESS) send_neighbor = MPI_PROC_NULL + + nreqs = nreqs + 1 + call MPI_ISEND(send_buf, pack_pos, MPI_PACKED, send_neighbor, tag, MPI_COMM_WORLD, requests(nreqs), & + & ierr) + end do + end do + end do + + call MPI_WAITALL(nreqs, requests, MPI_STATUSES_IGNORE, ierr) + + ! 
Unpack all received buffers + do nbr_idx = 1, merge(26, 8, num_dims == 3) + if (recv_neighbor_list(nbr_idx) == MPI_PROC_NULL) cycle + unpack_pos = 0 + call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, recv_count, 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + do i = 1, recv_count + call MPI_UNPACK(recv_bufs(:,nbr_idx), buf_size, unpack_pos, tmp_patch, patch_bytes, MPI_BYTE, & + & MPI_COMM_WORLD, ierr) + already_known = .false. + do j = 1, num_ibs + if (patch_ib(j)%gbl_patch_id == tmp_patch%gbl_patch_id) then + already_known = .true. + exit + end if + end do + if (.not. already_known) then + num_ibs = num_ibs + 1 + @:ASSERT(num_ibs <= size(patch_ib), 'patch_ib overflow in neighborhood handoff') + patch_ib(num_ibs) = tmp_patch + end if + end do + end do + + deallocate (send_buf, recv_bufs) + end if + #:endif + + end subroutine s_handoff_ib_ownership + end module m_ibm diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index d544324c8a..1420464202 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -39,6 +39,7 @@ module m_start_up use m_nvtx use m_ibm + use m_collisions use m_compile_specific use m_checker_common use m_checker @@ -915,6 +916,7 @@ contains if (model_eqns == 3) call s_initialize_internal_energy_equations(q_cons_ts(1)%vf) if (ib) then if (t_step_start > 0) call s_read_ib_restart_data(t_step_start) + call s_reduce_ib_patch_array() call s_ibm_setup() if (t_step_start == 0) then call s_write_ib_data_file(0) @@ -1010,6 +1012,8 @@ contains #else "on CPUs" #endif + else + allocate (patch_ib(num_ib_patches_max)) end if call s_mpi_bcast_user_inputs() @@ -1192,4 +1196,111 @@ contains end subroutine s_read_ib_restart_data + !> @brief takes the patch_ib struct array that contains all global IB patches and reduces to only contain patches that are in + !! the local computational domain. + subroutine s_reduce_ib_patch_array() + + type(ib_patch_parameters), dimension(num_ib_patches_max) :: patch_ib_gbl + real(wp) :: position + integer :: i, j + integer :: num_aware_ibs + logical :: is_in_neighborhood, is_local + + patch_ib_gbl(:) = patch_ib(:) + call get_neighbor_bounds() ! make sure the bounds of the neighbors are correctly set up + + deallocate (patch_ib) + if (num_dims == 3) then + num_aware_ibs = min(num_local_ibs_max*27, num_ib_patches_max) + else + num_aware_ibs = min(num_local_ibs_max*9, num_ib_patches_max) + end if + allocate (patch_ib(num_aware_ibs)) + + num_gbl_ibs = num_ibs + +#ifdef MFC_MPI + ! fallback for 1-rank case + if (num_procs == 1) then + patch_ib(:) = patch_ib_gbl(1:num_aware_ibs) + else + ! determine the set of patches owned by local rank + num_local_ibs = 0 + num_ibs = 0 + do i = 1, num_gbl_ibs + ! catch the edge case where th collision lies just outside the computational domain + is_in_neighborhood = .true. + is_local = .true. + + #:for X, ID, DIM in [('x', 1, 'm'), ('y', 2, 'n'), ('z', 3, 'p')] + if (num_dims >= ${ID}$) then + position = patch_ib_gbl(i)%${X}$_centroid + if (neighbor_domain_${X}$%beg > position .or. position > neighbor_domain_${X}$%end) then + is_in_neighborhood = .false. + is_local = .false. + else if (${X}$_cb(-1) > position .or. position > ${X}$_cb(${DIM}$)) then + is_local = .false. + end if + end if + #:endfor + + if (is_in_neighborhood) then + num_ibs = num_ibs + 1 + patch_ib(num_ibs) = patch_ib_gbl(i) + patch_ib(num_ibs)%gbl_patch_id = i + if (is_local) then + num_local_ibs = num_local_ibs + 1 + local_ib_patch_ids(num_local_ibs) = num_ibs + end if + end if + end do + end if +#else + ! 
reduce the size of the array for local simulation in no-MPI case + patch_ib(:) = patch_ib_gbl(1:num_aware_ibs) +#endif + + end subroutine s_reduce_ib_patch_array + + subroutine get_neighbor_bounds() + + real(wp) :: send_val, recv_val + integer :: send_neighbor, recv_neighbor, ierr + + ! Default: no neighbor in any direction + + neighbor_domain_x%beg = -huge(0._wp) + neighbor_domain_x%end = huge(0._wp) + neighbor_domain_y%beg = -huge(0._wp) + neighbor_domain_y%end = huge(0._wp) + neighbor_domain_z%beg = -huge(0._wp) + neighbor_domain_z%end = huge(0._wp) + +#ifdef MFC_MPI + #:for X, ID, TAG, DIM in [('x', 1, 100, 'm'), ('y', 2, 102, 'n'), ('z', 3, 104, 'p')] + if (num_dims >= ${ID}$) then + ! Step 1: broadcast left edge (-1 face) rightward; receive left neighbor's left edge -> neighbor_domain_${X}$%beg + send_val = ${X}$_cb(-1) + send_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_val = -huge(0._wp) + call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG}$, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + neighbor_domain_${X}$%beg = recv_val + + ! Step 2: broadcast right edge (${DIM}$ face) leftward; receive right neighbor's right edge -> + ! neighbor_domain_${X}$%end + send_val = ${X}$_cb(${DIM}$) + send_neighbor = merge(bc_${X}$%beg, MPI_PROC_NULL, bc_${X}$%beg >= 0) + recv_neighbor = merge(bc_${X}$%end, MPI_PROC_NULL, bc_${X}$%end >= 0) + recv_val = huge(0._wp) + call MPI_SENDRECV(send_val, 1, mpi_p, send_neighbor, ${TAG + 1}$, recv_val, 1, mpi_p, recv_neighbor, ${TAG + 1}$, & + & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + neighbor_domain_${X}$%end = recv_val + end if + #:endfor +#endif + + end subroutine get_neighbor_bounds + end module m_start_up diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index ce0ae48c4b..6b914eb6ff 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -566,8 +566,10 @@ contains ! if (ib) then - if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs() - if (ib_state_wrt .and. (.not. moving_immersed_boundary_flag)) then + if (moving_immersed_boundary_flag) then + call s_wrap_periodic_ibs() ! wraps the positions of IBs to the local proc + call s_handoff_ib_ownership() ! recomputes which ranks own which IBs and communicate to neighbors + else if (ib_state_wrt) then call s_compute_ib_forces(q_prim_vf, fluid_pp) end if end if
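For reference, a standalone sketch (not part of the patch) of the mixed-radix periodicity encoding that s_encode_patch_periodicity and its decoder now base on num_gbl_ibs; the value num_gbl_ibs = 10 and the sample digits are illustrative only.

    program demo_patch_periodicity_encoding
        implicit none
        integer, parameter :: num_gbl_ibs = 10
        integer :: patch_id, xp, yp, zp, base, enc, offset, dec_id, dec_xp

        patch_id = 7; xp = 1; yp = 0; zp = -1   ! a -1 periodicity digit is stored as 2 so the offset stays non-negative

        base = num_gbl_ibs + 1
        enc = patch_id + base*(f_wrap(xp) + 3*f_wrap(yp) + 9*f_wrap(zp))   ! same form as the encoder's offset

        dec_id = mod(enc - 1, base) + 1        ! recover the global patch id, as in the decoder
        offset = (enc - dec_id)/base
        dec_xp = mod(offset, 3)                ! recover the x periodicity digit (0, 1, or 2)

        print *, enc, dec_id, dec_xp           ! prints 216, 7, 1 for the sample digits above

    contains

        integer function f_wrap(per)
            integer, intent(in) :: per
            f_wrap = per; if (per == -1) f_wrap = 2
        end function f_wrap

    end program demo_patch_periodicity_encoding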