Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,6 @@ jobs:
sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'"
done

- name: Create run-scoped temp directory
run: |
TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}"
sudo rm -rf "$TEST_NETWORK_TMPDIR"
mkdir -p "$TEST_NETWORK_TMPDIR"
sudo chown -R "$(id -u):$(id -g)" "$TEST_NETWORK_TMPDIR"

# Avoids rate limits when running the tests
# Tests includes pulling, then converting to disk images
- name: Login to Docker Hub
Expand Down Expand Up @@ -141,7 +134,6 @@ jobs:

- name: Run tests
env:
HYPEMAN_TEST_NETWORK_TMPDIR: /tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}
GO_TEST_TIMEOUT: 600s
# Docker auth for tests running as root (sudo)
DOCKER_CONFIG: /home/debianuser/.docker
Expand Down Expand Up @@ -171,7 +163,6 @@ jobs:
echo "$units" | xargs -r sudo systemctl reset-failed || true
fi
sudo rm -f /run/hypeman/uffd/ci-${{ github.run_id }}-${{ github.run_attempt }}-*.env
sudo rm -rf "/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}"
rm -f "${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }}"

test-darwin:
Expand Down
143 changes: 138 additions & 5 deletions lib/instances/test_network_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
Expand All @@ -32,6 +33,34 @@ const (
testSubnetThirdOctetMax = 250
)

// iptablesWaitSeconds matches the production convention in
// lib/network/bridge_linux.go so the test harness cooperates with the
// system-wide xtables lock instead of failing immediately under contention.
const testIPTablesWaitSeconds = "5"

// iptablesDeleteAttempts is the number of times we retry an iptables delete on
// transient lock contention before giving up.
const iptablesDeleteAttempts = 3

// newTestIPTablesCommand builds an iptables command with the -w wait flag so it
// waits for the xtables lock rather than failing immediately.
func newTestIPTablesCommand(args ...string) *exec.Cmd {
fullArgs := make([]string, 0, len(args)+2)
fullArgs = append(fullArgs, "-w", testIPTablesWaitSeconds)
fullArgs = append(fullArgs, args...)
return exec.Command("iptables", fullArgs...)
}

// logTestNetworkErr surfaces a real cleanup failure to stderr. These helpers run
// both with and without a *testing.T (e.g. under testNetworkGuardCleanupOnce and
// in t.Cleanup), so we log instead of swallowing the error silently.
func logTestNetworkErr(op string, err error) {
if err == nil {
return
}
fmt.Fprintf(os.Stderr, "hypeman-test-network: %s: %v\n", op, err)
}

type testNetworkLease struct {
cfg config.NetworkConfig
release func()
Expand Down Expand Up @@ -105,6 +134,10 @@ func allocateTestNetworkLease(testName string, seq uint32) (*testNetworkLease, e

testNetworkGuardCleanupOnce.Do(func() {
cleanupStaleLinkDownRoutes(routes)
// Sweep iptables rules for test bridges that no longer exist. Once a
// bridge is fully deleted its route is gone too, so linkdown cleanup
// above can't catch these — they would otherwise leak forever.
sweepOrphanedTestIPTablesRules()
// Refresh route snapshot after cleanup so subnet selection sees current state.
refreshed, refreshErr := listHostRoutes()
if refreshErr == nil {
Expand Down Expand Up @@ -324,6 +357,74 @@ func cleanupStaleLinkDownRoutes(routes []hostRoute) {
}
}

// testRuleCommentPattern matches hypeman test-harness iptables rule comments and
// captures both the full comment and the referenced bridge name. We deliberately
// anchor on the "hm" test prefix so we never touch "ha"-prefixed rules from a
// real (non-test) hypeman process running on the same host.
var testRuleCommentPattern = regexp.MustCompile(`hypeman-(?:fwd-out|fwd-in|nat)-(hm[0-9a-f]+)`)

// sweepOrphanedTestIPTablesRules removes hypeman test iptables rules whose
// referenced bridge interface no longer exists. Once a bridge is fully deleted,
// its route disappears, so cleanupStaleLinkDownRoutes can't reach these rules and
// they accumulate indefinitely on the shared CI runner, slowing every iptables
// operation under the xtables lock.
//
// This is conservative and safe: a rule whose bridge interface is gone can never
// affect live traffic, and we never delete a rule whose interface still exists.
func sweepOrphanedTestIPTablesRules() {
sweepOrphanedTestRulesInChain("", "FORWARD")
sweepOrphanedTestRulesInChain("nat", "POSTROUTING")
}

func sweepOrphanedTestRulesInChain(table, chain string) {
args := []string{}
if table != "" {
args = append(args, "-t", table)
}
args = append(args, "-S", chain)
output, err := newTestIPTablesCommand(args...).Output()
if err != nil {
logTestNetworkErr(fmt.Sprintf("iptables -S %s/%s for orphan sweep", table, chain), err)
return
}

// Collect comments whose bridge interface is gone. Cache bridge existence and
// dedupe comments so we shell out and delete each orphan only once.
exists := make(map[string]bool)
seen := make(map[string]struct{})
var orphanedComments []string
for _, line := range strings.Split(string(output), "\n") {
match := testRuleCommentPattern.FindStringSubmatch(line)
if match == nil {
continue
}
comment := match[0]
bridge := match[1]

alive, checked := exists[bridge]
if !checked {
alive = bridgeExists(bridge)
exists[bridge] = alive
}
if alive {
// Never delete a rule whose bridge interface still exists.
continue
}

if _, ok := seen[comment]; ok {
continue
}
seen[comment] = struct{}{}
orphanedComments = append(orphanedComments, comment)
}

// Delete by comment, reusing the existing line-number-based deleter which
// handles quoting and renumbering correctly.
for _, comment := range orphanedComments {
deleteIPTablesRulesByComment(table, chain, comment)
}
}

func pruneStaleLeases(leases map[string]subnetLease, routes []hostRoute) {
liveRoutes := make(map[string]struct{}, len(routes))
for _, route := range routes {
Expand Down Expand Up @@ -422,10 +523,18 @@ func isTestCIDR(cidr string) bool {

func cleanupTestNetworkArtifacts(bridgeName, subnetCIDR string) {
if subnetCIDR != "" && bridgeName != "" {
_ = exec.Command("ip", "-4", "route", "del", subnetCIDR, "dev", bridgeName).Run()
// The route may already be gone (linkdown cleanup, prior pass) — that is
// not a real failure, so only surface unexpected errors.
out, err := exec.Command("ip", "-4", "route", "del", subnetCIDR, "dev", bridgeName).CombinedOutput()
if err != nil && !isNoSuchObjectErr(out) {
logTestNetworkErr(fmt.Sprintf("ip route del %s dev %s: %s", subnetCIDR, bridgeName, strings.TrimSpace(string(out))), err)
}
}
if bridgeName != "" {
_ = exec.Command("ip", "link", "delete", bridgeName, "type", "bridge").Run()
out, err := exec.Command("ip", "link", "delete", bridgeName, "type", "bridge").CombinedOutput()
if err != nil && !isNoSuchObjectErr(out) {
logTestNetworkErr(fmt.Sprintf("ip link delete %s: %s", bridgeName, strings.TrimSpace(string(out))), err)
}
}

bridgeSuffix := strings.ToLower(bridgeName)
Expand All @@ -434,6 +543,14 @@ func cleanupTestNetworkArtifacts(bridgeName, subnetCIDR string) {
deleteIPTablesRulesByComment("", "FORWARD", "hypeman-fwd-in-"+bridgeSuffix)
}

// isNoSuchObjectErr reports whether an `ip` command output indicates the object
// (route/link) was already gone, which is an expected, benign outcome for
// idempotent cleanup.
func isNoSuchObjectErr(combinedOutput []byte) bool {
out := strings.ToLower(string(combinedOutput))
return strings.Contains(out, "cannot find") || strings.Contains(out, "no such")
}

func deleteIPTablesRulesByComment(table, chain, comment string) {
if comment == "" {
return
Expand All @@ -444,9 +561,9 @@ func deleteIPTablesRulesByComment(table, chain, comment string) {
args = append(args, "-t", table)
}
args = append(args, "-L", chain, "--line-numbers", "-n")
listCmd := exec.Command("iptables", args...)
output, err := listCmd.Output()
output, err := newTestIPTablesCommand(args...).Output()
if err != nil {
logTestNetworkErr(fmt.Sprintf("iptables list %s/%s for comment %q", table, chain, comment), err)
return
}

Expand All @@ -472,8 +589,24 @@ func deleteIPTablesRulesByComment(table, chain, comment string) {
delArgs = append(delArgs, "-t", table)
}
delArgs = append(delArgs, "-D", chain, strconv.Itoa(ruleNums[i]))
_ = exec.Command("iptables", delArgs...).Run()
deleteIPTablesRuleWithRetry(delArgs)
}
}

// deleteIPTablesRuleWithRetry runs an iptables `-D` delete, retrying a few times
// on transient lock contention (the failure mode under concurrent CI jobs).
func deleteIPTablesRuleWithRetry(delArgs []string) {
var err error
for attempt := 0; attempt < iptablesDeleteAttempts; attempt++ {
if attempt > 0 {
time.Sleep(time.Duration(attempt) * 100 * time.Millisecond)
}
err = newTestIPTablesCommand(delArgs...).Run()
if err == nil {
return
}
}
logTestNetworkErr(fmt.Sprintf("iptables %s", strings.Join(delArgs, " ")), err)
}

func legacyParallelTestNetworkConfig(seq uint32) config.NetworkConfig {
Expand Down
Loading