From bb12c479915a12f61b28d80b0b736e5d79e9744d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 28 Jan 2026 17:57:27 +0000 Subject: [PATCH 1/4] fix: cleanup temporary shard files on failed indexing When zoekt-git-index fails during repository indexing, it can leave behind .tmp shard files that accumulate over time and fill up disk space. This is especially problematic for large repos that repeatedly fail to index. Changes: - Add cleanupTempShards() function to zoekt.ts that removes temporary shard files (files with .tmp in their name) for a specific repository - Call cleanupTempShards() in repoIndexManager.ts when indexGitRepository fails, before re-throwing the error This ensures that even if a repository consistently fails to index, the temporary files created during each attempt are cleaned up. Co-authored-by: michael --- packages/backend/src/repoIndexManager.ts | 16 +++++++++--- packages/backend/src/zoekt.ts | 32 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 8e499863a..69226935c 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -13,7 +13,7 @@ import { captureEvent } from './posthog.js'; import { PromClient } from './promClient.js'; import { RepoWithConnections, Settings } from "./types.js"; import { getAuthCredentialsForRepo, getShardPrefix, groupmqLifecycleExceptionWrapper, measure, setIntervalAsync } from './utils.js'; -import { indexGitRepository } from './zoekt.js'; +import { cleanupTempShards, indexGitRepository } from './zoekt.js'; const LOG_TAG = 'repo-index-manager'; const logger = createLogger(LOG_TAG); @@ -438,9 +438,17 @@ export class RepoIndexManager { } logger.info(`Indexing ${repo.name} (id: ${repo.id})...`); - const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal)); - const indexDuration_s = durationMs / 1000; - 
logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`); + try { + const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal)); + const indexDuration_s = durationMs / 1000; + logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`); + } catch (error) { + // Clean up any temporary shard files left behind by the failed indexing operation. + // Zoekt creates .tmp files during indexing which can accumulate if indexing fails repeatedly. + logger.warn(`Indexing failed for ${repo.name} (id: ${repo.id}), cleaning up temp shard files...`); + await cleanupTempShards(repo); + throw error; + } return revisions; } diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts index 27f17d711..ff97d1cee 100644 --- a/packages/backend/src/zoekt.ts +++ b/packages/backend/src/zoekt.ts @@ -1,6 +1,7 @@ import { Repo } from "@sourcebot/db"; import { createLogger, env, getRepoPath } from "@sourcebot/shared"; import { exec } from "child_process"; +import { readdir, rm } from "fs/promises"; import { INDEX_CACHE_DIR } from "./constants.js"; import { Settings } from "./types.js"; import { getShardPrefix } from "./utils.js"; @@ -54,3 +55,34 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio }) }); } + +/** + * Cleans up temporary shard files left behind by a failed indexing operation. + * Zoekt creates temporary files (with `.tmp` in their name) during indexing, which + * can be left behind if the indexing process fails or is interrupted. 
+ * + * @param repo - The repository whose temp shards should be cleaned up + */ +export const cleanupTempShards = async (repo: Repo) => { + const shardPrefix = getShardPrefix(repo.orgId, repo.id); + + try { + const files = await readdir(INDEX_CACHE_DIR); + const tempFiles = files.filter(file => + file.startsWith(shardPrefix) && file.includes('.tmp') + ); + + for (const file of tempFiles) { + const filePath = `${INDEX_CACHE_DIR}/${file}`; + logger.info(`Cleaning up temp shard file: ${filePath}`); + await rm(filePath, { force: true }); + } + + if (tempFiles.length > 0) { + logger.info(`Cleaned up ${tempFiles.length} temp shard file(s) for repo ${repo.id}`); + } + } catch (error) { + // Log but don't throw - cleanup is best effort + logger.warn(`Failed to cleanup temp shards for repo ${repo.id}:`, error); + } +} From 6bcb977385a16fae055bced223c7a100dc8f9862 Mon Sep 17 00:00:00 2001 From: bkellam Date: Fri, 6 Feb 2026 18:42:00 -0800 Subject: [PATCH 2/4] fix merge issue --- packages/backend/src/repoIndexManager.ts | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 7081bff49..11fc0b945 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -1,19 +1,17 @@ import * as Sentry from '@sentry/node'; import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db"; -import { createLogger, Logger } from "@sourcebot/shared"; -import { env, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema, getRepoPath } from '@sourcebot/shared'; +import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared"; +import { DelayedError, Job, Queue, Worker } from "bullmq"; import { existsSync } from 'fs'; import { readdir, rm } from 'fs/promises'; -import { DelayedError, 
Job, Queue, Worker } from "bullmq"; import { Redis } from 'ioredis'; -import Redlock, { ExecutionError } from 'redlock'; import micromatch from 'micromatch'; -import { WORKER_STOP_GRACEFUL_TIMEOUT_MS, INDEX_CACHE_DIR } from './constants.js'; +import Redlock, { ExecutionError } from 'redlock'; +import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js'; import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, unsetGitConfig, upsertGitConfig } from './git.js'; import { captureEvent } from './posthog.js'; import { PromClient } from './promClient.js'; import { RepoWithConnections, Settings } from "./types.js"; -import { getAuthCredentialsForRepo, getShardPrefix, groupmqLifecycleExceptionWrapper, measure, setIntervalAsync } from './utils.js'; import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js'; import { cleanupTempShards, indexGitRepository } from './zoekt.js'; From c5b57e7d24df85f743df644f611d20d5c3adac8f Mon Sep 17 00:00:00 2001 From: bkellam Date: Fri, 6 Feb 2026 18:44:25 -0800 Subject: [PATCH 3/4] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca43800dc..fbcfb2ee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added support to set "Require approval for new members" via config with (`REQUIRE_APPROVAL_NEW_MEMBERS`). [#858](https://github.com/sourcebot-dev/sourcebot/pull/858) +- Added automatic cleanup of tmp shard files on indexing failure. [#805](https://github.com/sourcebot-dev/sourcebot/pull/805) ### Changed - Improved stability for connection and repo indexing workers. 
[#860](https://github.com/sourcebot-dev/sourcebot/pull/860) From d1d0c1151195e558b101b4b3096a8c2ef26702aa Mon Sep 17 00:00:00 2001 From: bkellam Date: Fri, 6 Feb 2026 18:52:24 -0800 Subject: [PATCH 4/4] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fbcfb2ee5..56dfc1429 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added support to set "Require approval for new members" via config with (`REQUIRE_APPROVAL_NEW_MEMBERS`). [#858](https://github.com/sourcebot-dev/sourcebot/pull/858) -- Added automatic cleanup of tmp shard files on indexing failure. [#805](https://github.com/sourcebot-dev/sourcebot/pull/805) ### Changed - Improved stability for connection and repo indexing workers. [#860](https://github.com/sourcebot-dev/sourcebot/pull/860) @@ -17,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed issue where certain file and folder names would cause type errors. [#862](https://github.com/sourcebot-dev/sourcebot/pull/862) - Fixed token refresh error "Provider config not found or invalid for: x" when a sso is configured using deprecated env vars. [#841](https://github.com/sourcebot-dev/sourcebot/pull/841) +- Fixed issue where temporary shard files created on index failure were not being cleaned up. [#805](https://github.com/sourcebot-dev/sourcebot/pull/805) ## [4.10.27] - 2026-02-05