Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/include/cuopt/linear_programming/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#define CUOPT_ORDERING "ordering"
#define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point"
#define CUOPT_BARRIER_ITERATIVE_REFINEMENT "barrier_iterative_refinement"
#define CUOPT_BARRIER_SOC_THRESHOLD "barrier_soc_threshold"
#define CUOPT_BARRIER_STEP_SCALE "barrier_step_scale"
#define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns"
#define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ class pdlp_solver_settings_t {
bool eliminate_dense_columns{true};
pdlp_precision_t pdlp_precision{pdlp_precision_t::DefaultPrecision};
bool barrier_iterative_refinement{true};
i_t barrier_soc_threshold{5};
f_t barrier_step_scale{0.9};
bool save_best_primal_so_far{false};
/**
Expand Down
491 changes: 396 additions & 95 deletions cpp/src/barrier/barrier.cu

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion cpp/src/barrier/iterative_refinement.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

Expand Down
601 changes: 535 additions & 66 deletions cpp/src/barrier/second_order_cone_kernels.cuh

Large diffs are not rendered by default.

116 changes: 71 additions & 45 deletions cpp/src/barrier/second_order_cone_reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
#include <utilities/copy_helpers.hpp>
#include <utilities/macros.cuh>

#include <cub/block/block_reduce.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/warp/warp_reduce.cuh>

#include <rmm/device_buffer.hpp>
Expand All @@ -24,7 +24,6 @@

#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>

#include <algorithm>
#include <concepts>
Expand All @@ -46,14 +45,22 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize)
OutputIt output,
value_t init);

template <std::integral i_t, typename value_t, int block_dim, typename InputIt, typename OutputIt>
__global__ void __launch_bounds__(block_dim)
block_per_cone_reduce_kernel(InputIt input,
raft::device_span<const i_t> medium_cone_ids,
raft::device_span<const std::size_t> cone_offsets,
OutputIt output,
value_t init);

/**
* Segmented-sum dispatcher for packed second-order cone vectors.
*
* Cone dimensions are fixed for a solve, so the constructor partitions cone
* ids once by reduction strategy. Each call then reuses those partitions:
* small cones use one warp per cone, medium cones use CUB DeviceSegmentedReduce,
* small cones use one warp per cone, medium cones use one block per cone,
* and large cones use CUB DeviceReduce one cone at a time. The object owns the
* CUB workspace for those medium/large paths. Call `prepare_workspace` once
* CUB workspace for the large-cone path. Call `prepare_workspace` once
* before using a CUB-backed path.
*/
template <std::integral i_t, i_t warp_cone_dim = 64, i_t large_cone_cutoff = 32768>
Expand All @@ -69,7 +76,7 @@ struct segmented_sum_t {
std::vector<i_t> large_cone_ids;
std::vector<i_t> large_cone_dimensions;

// Maximum CUB temporary storage needed by prepared medium/large reductions.
// Maximum CUB temporary storage needed by prepared large reductions.
std::size_t cub_workspace_bytes = 0;
rmm::device_buffer cub_workspace;

Expand All @@ -80,24 +87,6 @@ struct segmented_sum_t {
auto input = thrust::make_constant_iterator(value_t{});
auto output = thrust::make_discard_iterator();

if (!medium_cone_ids.is_empty()) {
const auto medium_begin_offsets =
thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin());
const auto medium_end_offsets =
thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin());

std::size_t temp_storage_bytes = 0;
RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
temp_storage_bytes,
input,
output,
medium_cone_ids.size(),
medium_begin_offsets,
medium_end_offsets,
stream.value()));
cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes);
}

for (std::size_t i = 0; i < large_cone_ids.size(); ++i) {
std::size_t temp_storage_bytes = 0;
RAFT_CUDA_TRY(cub::DeviceReduce::Sum(nullptr,
Expand All @@ -109,7 +98,7 @@ struct segmented_sum_t {
cub_workspace_bytes = std::max(cub_workspace_bytes, temp_storage_bytes);
}

if (cub_workspace.size() < cub_workspace_bytes) {
if (cub_workspace_bytes > 0 && cub_workspace.size() < cub_workspace_bytes) {
cub_workspace.resize(cub_workspace_bytes, stream);
}
}
Expand Down Expand Up @@ -138,31 +127,17 @@ struct segmented_sum_t {
}

if (!medium_cone_ids.is_empty()) {
cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes,
"segmented_sum_t::prepare_workspace must be called before reducing medium or "
"large cones");

const auto medium_output = thrust::make_permutation_iterator(output, medium_cone_ids.begin());
const auto medium_begin_offsets =
thrust::make_permutation_iterator(cone_offsets.data(), medium_cone_ids.begin());
const auto medium_end_offsets =
thrust::make_permutation_iterator(cone_offsets.data() + 1, medium_cone_ids.begin());

std::size_t temp_storage_bytes = cub_workspace_bytes;
RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(cub_workspace.data(),
temp_storage_bytes,
input,
medium_output,
medium_cone_ids.size(),
medium_begin_offsets,
medium_end_offsets,
stream.value()));
constexpr int medium_block_dim = 256;
const auto n_medium = medium_cone_ids.size();
block_per_cone_reduce_kernel<i_t, value_t, medium_block_dim>
<<<n_medium, medium_block_dim, 0, stream.value()>>>(
input, cuopt::make_span(medium_cone_ids), cone_offsets, output, init);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

if (!large_cone_ids.empty()) {
cuopt_assert(cub_workspace_bytes > 0 && cub_workspace.size() >= cub_workspace_bytes,
"segmented_sum_t::prepare_workspace must be called before reducing medium or "
"large cones");
"segmented_sum_t::prepare_workspace must be called before reducing large cones");

for (std::size_t i = 0; i < large_cone_ids.size(); ++i) {
std::size_t temp_storage_bytes = cub_workspace_bytes;
Expand Down Expand Up @@ -258,4 +233,55 @@ __global__ void __launch_bounds__(warps_per_cta* raft::WarpSize)
if (lane_id == 0) { output[cone] = sum; }
}

template <std::integral i_t, typename value_t, int block_dim, typename InputIt, typename OutputIt>
__global__ void __launch_bounds__(block_dim)
block_per_cone_reduce_kernel(InputIt input,
raft::device_span<const i_t> medium_cone_ids,
raft::device_span<const std::size_t> cone_offsets,
OutputIt output,
value_t init)
{
static_assert(block_dim > 0);
static_assert(block_dim <= 1024);

constexpr int items_per_thread = 4;

using block_reduce_t = cub::BlockReduce<value_t, block_dim>;
__shared__ typename block_reduce_t::TempStorage temp_storage;

const auto slot = blockIdx.x;
if (slot >= medium_cone_ids.size()) { return; }

const auto cone = medium_cone_ids[slot];
const auto off = cone_offsets[cone];
const auto dim = cone_offsets[cone + 1] - off;

value_t acc[items_per_thread];
#pragma unroll
for (int k = 0; k < items_per_thread; ++k) {
acc[k] = value_t{0};
}

// Thread t owns elements t, t+block_dim, ..., t+(items_per_thread-1)*block_dim
// within each tile of `block_dim * items_per_thread` consecutive entries, so
// every warp load stays coalesced.
const std::size_t tile = static_cast<std::size_t>(block_dim) * items_per_thread;
for (std::size_t tile_start = 0; tile_start < dim; tile_start += tile) {
#pragma unroll
for (int k = 0; k < items_per_thread; ++k) {
const std::size_t idx = tile_start + threadIdx.x + static_cast<std::size_t>(k) * block_dim;
if (idx < dim) { acc[k] = acc[k] + input[off + idx]; }
}
}

value_t sum = init;
#pragma unroll
for (int k = 0; k < items_per_thread; ++k) {
sum = sum + acc[k];
}

sum = block_reduce_t(temp_storage).Sum(sum);
if (threadIdx.x == 0) { output[cone] = sum; }
Comment on lines +277 to +284

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win

init is accumulated block_dim times instead of once.

Every thread seeds sum = init, and block_reduce_t::Sum then sums across all block_dim threads, so the result becomes Σ entries + block_dim * init (256×) rather than Σ entries + init. The convenience overload at Line 157 currently passes f_t{0}, so this is latent today, but the public operator() at Line 114 accepts an arbitrary init, so any non-zero caller would get a wrong cone sum. (The warp kernel has the same shape but only multiplies by 32, so the two paths are also inconsistent.)

Apply init once, after the block reduction.

🐛 Proposed fix
-  value_t sum = init;
+  value_t sum = value_t{0};
 `#pragma` unroll
   for (int k = 0; k < items_per_thread; ++k) {
     sum = sum + acc[k];
   }
 
   sum = block_reduce_t(temp_storage).Sum(sum);
-  if (threadIdx.x == 0) { output[cone] = sum; }
+  if (threadIdx.x == 0) { output[cone] = init + sum; }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
value_t sum = init;
#pragma unroll
for (int k = 0; k < items_per_thread; ++k) {
sum = sum + acc[k];
}
sum = block_reduce_t(temp_storage).Sum(sum);
if (threadIdx.x == 0) { output[cone] = sum; }
value_t sum = value_t{0};
`#pragma` unroll
for (int k = 0; k < items_per_thread; ++k) {
sum = sum + acc[k];
}
sum = block_reduce_t(temp_storage).Sum(sum);
if (threadIdx.x == 0) { output[cone] = init + sum; }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/src/barrier/second_order_cone_reduction.cuh` around lines 277 - 284, The
cone reduction in second_order_cone_reduction.cuh is adding init on every thread
before block_reduce_t::Sum, which makes the final result include init multiplied
by block_dim instead of once. Update the reduction flow in the block path so
that only the accumulated acc[k] values are reduced first, then apply init a
single time after the block reduction in the operator() / reduction helper that
computes the cone sum. Keep the same logic consistent with the warp path and
verify the convenience overload still works for f_t{0}.

}

} // namespace cuopt::linear_programming::dual_simplex
2 changes: 2 additions & 0 deletions cpp/src/dual_simplex/simplex_solver_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ struct simplex_solver_settings_t {
eliminate_dense_columns(true),
barrier_iterative_refinement(true),
barrier_step_scale(0.9),
barrier_soc_threshold(5),
num_gpus(1),
folding(-1),
augmented(0),
Expand Down Expand Up @@ -157,6 +158,7 @@ struct simplex_solver_settings_t {
bool eliminate_dense_columns; // true to eliminate dense columns from A*D*A^T
bool barrier_iterative_refinement; // true to use iterative refinement for barrier method
f_t barrier_step_scale; // step scale for barrier method
i_t barrier_soc_threshold; // SOC dimension above which rank-2 sparse scaling is used
int num_gpus; // Number of GPUs to use (maximum of 2 gpus are supported at the moment)
i_t folding; // -1 automatic, 0 don't fold, 1 fold
i_t augmented; // -1 automatic, 0 to solve with ADAT, 1 to solve with augmented system
Expand Down
1 change: 1 addition & 0 deletions cpp/src/math_optimization/solver_settings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
{CUOPT_DUALIZE, &pdlp_settings.dualize, -1, 1, -1},
{CUOPT_ORDERING, &pdlp_settings.ordering, -1, 1, -1},
{CUOPT_BARRIER_DUAL_INITIAL_POINT, &pdlp_settings.barrier_dual_initial_point, -1, 1, -1},
{CUOPT_BARRIER_SOC_THRESHOLD, &pdlp_settings.barrier_soc_threshold, 0, 1000000, 5, "SOC dimension above which rank-2 sparse KKT expansion is used"},
{CUOPT_MIP_CUT_PASSES, &mip_settings.max_cut_passes, -1, std::numeric_limits<i_t>::max(), 10},
{CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS, &mip_settings.mir_cuts, -1, 1, -1},
{CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS, &mip_settings.mixed_integer_gomory_cuts, -1, 1, -1},
Expand Down
1 change: 1 addition & 0 deletions cpp/src/pdlp/solve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ run_barrier(dual_simplex::user_problem_t<i_t, f_t>& user_problem,
barrier_settings.crossover = settings.crossover;
barrier_settings.eliminate_dense_columns = settings.eliminate_dense_columns;
barrier_settings.barrier_iterative_refinement = settings.barrier_iterative_refinement;
barrier_settings.barrier_soc_threshold = settings.barrier_soc_threshold;
barrier_settings.barrier_step_scale = settings.barrier_step_scale;
barrier_settings.cudss_deterministic = settings.cudss_deterministic;
barrier_settings.barrier_relaxed_feasibility_tol = settings.tolerances.relative_primal_tolerance;
Expand Down
1 change: 1 addition & 0 deletions cpp/tests/socp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

ConfigureTest(SOCP_TEST
${CMAKE_CURRENT_SOURCE_DIR}/second_order_cone_kernels.cu
${CMAKE_CURRENT_SOURCE_DIR}/sparse_augmented_kkt_test.cu
${CMAKE_CURRENT_SOURCE_DIR}/solve_barrier_socp.cu
${CMAKE_CURRENT_SOURCE_DIR}/general_quadratic_test.cu
LABELS numopt)
Loading
Loading