diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89165be..d3e6020 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,7 +128,7 @@ jobs: --region $REGION \ --project $PROJECT_ID \ --allow-unauthenticated \ - --set-env-vars="^|^CONFIG_REPO_OWNER=grove-platform|CONFIG_REPO_NAME=github-copier|CONFIG_REPO_BRANCH=main|PEM_NAME=CODE_COPIER_PEM|WEBHOOK_SECRET_NAME=webhook-secret|MONGO_URI_SECRET_NAME=mongo-uri|WEBSERVER_PATH=/events|MAIN_CONFIG_FILE=.copier/main.yaml|USE_MAIN_CONFIG=true|DEPRECATION_FILE=deprecated_examples.json|COMMITTER_NAME=GitHub Copier App|COMMITTER_EMAIL=bot@mongodb.com|GOOGLE_CLOUD_PROJECT_ID=github-copy-code-examples|COPIER_LOG_NAME=code-copier-log|AUDIT_ENABLED=false|METRICS_ENABLED=true|GITHUB_APP_ID=${{ secrets.APP_ID }}|INSTALLATION_ID=${{ secrets.INSTALLATION_ID }}" \ + --set-env-vars="^|^CONFIG_REPO_OWNER=grove-platform|CONFIG_REPO_NAME=github-copier|CONFIG_REPO_BRANCH=main|PEM_NAME=CODE_COPIER_PEM|WEBHOOK_SECRET_NAME=webhook-secret|MONGO_URI_SECRET_NAME=mongo-uri|WEBSERVER_PATH=/events|MAIN_CONFIG_FILE=.copier/main.yaml|USE_MAIN_CONFIG=true|DEPRECATION_FILE=deprecated_examples.json|COMMITTER_NAME=GitHub Copier App|COMMITTER_EMAIL=bot@mongodb.com|GOOGLE_CLOUD_PROJECT_ID=github-copy-code-examples|COPIER_LOG_NAME=code-copier-log|AUDIT_ENABLED=true|METRICS_ENABLED=true|OPERATOR_UI_ENABLED=true|OPERATOR_AUTH_REPO=grove-platform/github-copier|OPERATOR_REPO_SLUG=grove-platform/github-copier|LLM_PROVIDER=anthropic|LLM_BASE_URL=https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic|LLM_MODEL=claude-haiku-4-5|ANTHROPIC_API_KEY_SECRET_NAME=anthropic-api-key|GITHUB_APP_ID=${{ secrets.APP_ID }}|INSTALLATION_ID=${{ secrets.INSTALLATION_ID }}" \ --set-build-env-vars="VERSION=${{ steps.version.outputs.tag }}" \ --tag="${{ steps.version.outputs.traffic_tag }}" \ --max-instances=10 \ diff --git a/AGENT.md b/AGENT.md index f5c9680..33d3905 100644 --- a/AGENT.md +++ b/AGENT.md @@ -1,93 +1,145 @@ # 
Agent Context: GitHub Copier -Webhook service: PR merged → match files → transform paths → copy to target repos. +Webhook service + operator UI. + +**Webhook pipeline:** PR merged → match files → transform paths → copy to target repos. +**Operator UI:** `/operator/` — diagnostic dashboard with PAT auth, replay, audit browsing, and an AI rule suggester. Enabled via `OPERATOR_UI_ENABLED=true` + `OPERATOR_AUTH_REPO`. ## File Map ``` -app.go # Entrypoint, HTTP server, graceful shutdown +app.go # Entrypoint, HTTP server, graceful shutdown, startup banner services/ - webhook_handler_new.go # HandleWebhookWithContainer() orchestrator - workflow_processor.go # ProcessWorkflow() - core file matching logic - pattern_matcher.go # MatchFile(pattern, path) bool - token_manager.go # TokenManager (thread-safe token state, sync.RWMutex) - github_auth.go # ConfigurePermissions(), JWT generation + # Webhook pipeline + webhook_handler_new.go # HandleWebhookWithContainer() — orchestrator + workflow_processor.go # ProcessWorkflow() — core file matching logic + pattern_matcher.go # MatchFile(pattern, path) — prefix/glob/regex + github_auth.go # ConfigurePermissions(), JWT generation, LoadWebhookSecret, LoadMongoURI, LoadAnthropicAPIKey github_read.go # GetFilesChangedInPr() (GraphQL), RetrieveFileContents() - github_write_to_target.go # AddFilesToTargetRepos(), addFilesViaPR() + github_write_to_target.go # AddFilesToTargetRepos(); errTreeUnchanged sentinel for empty commits github_write_to_source.go # UpdateDeprecationFile(filesToDeprecate) + token_manager.go # TokenManager (thread-safe install tokens, sync.RWMutex) rate_limit.go # RateLimitTransport (auto-retry on 403/429) - delivery_tracker.go # DeliveryTracker (webhook idempotency via X-GitHub-Delivery) + delivery_tracker.go # Webhook idempotency via X-GitHub-Delivery + file_state_service.go # Per-request upload/deprecate queues (thread-safe) errors.go # Sentinel errors (ErrRateLimited, ErrNotFound, etc.) 
logger.go # slog JSON handler, LogCritical, LogAndReturnError - file_state_service.go # Tracks upload/deprecate queues (thread-safe) main_config_loader.go # LoadConfig() with $ref support config_loader.go # Config loading & validation - config_cache.go # CachedConfigLoader (TTL-based config caching) + config_cache.go # CachedConfigLoader (TTL-based) service_container.go # DI container - health_metrics.go # /health (liveness), /ready (readiness), /metrics - audit_logger.go # MongoDB audit logging + health_metrics.go # /health, /ready, /metrics, /config + audit_logger.go # MongoDB audit logging (driver v2; ObjectIDAsHexString for read decoding) slack_notifier.go # Slack notifications pr_template_fetcher.go # PR template resolution from target repos + webhook_trace_buffer.go # Ring buffer of recent webhook traces (Overview/Webhooks tabs) + log_buffer.go # Context-tagged per-delivery log ring buffer (logs drawer) + + # Operator UI + operator_ui.go # RegisterOperatorRoutes, wrapAPI / wrapOperatorOnly middleware, + # handleMe, handleRepoPermission, handleDeployment, handleReplay, + # handleRelease, githubCreateVersionTag, sharedGithubHTTPClient, + # llmPingCache, ReleaseAPIMode enum + operator_auth.go # GitHub PAT validation; ghAuthCache (SHA-256 hashed keys); + # validateGitHubPAT role mapping; ghAPIError (StatusCode, + # IsTransient); 5xx = soft-fail to writer, else RoleDenied + operator_ratelimit.go # tokenBucket — fixed-window rate limiter keyed by hashed PAT + # (30/hour on /suggest-rule) + operator_suggest_rule.go # AI rule suggester; SuggestRuleSystemPrompt (exported); + # verifySuggestedRule (runs rule through PatternMatcher) + operator_llm_admin.go # /llm/status (cached 30s), /llm/settings, /llm/pull (NDJSON), + # /llm/model delete. Maps ErrModelManagementNotSupported to 400. 
+ llm_client.go # LLMClient interface, NewLLMClient(LLMClientOptions) dispatch, + # ErrModelManagementNotSupported, ollamaClient impl + llm_anthropic.go # anthropicClient — /v1/messages, /v1/models, dual x-api-key + + # api-key headers (native API + Azure APIM gateway support) + web/operator/index.html # Embedded single-file SPA (HTML + CSS + JS); served by serveIndex + types/ - config.go # Workflow, Transformation, SourcePattern structs + config.go # Workflow, Transformation, SourcePattern, CommitStrategyConfig types.go # ChangedFile, UploadKey, UploadFileContent -configs/environment.go # Config struct, LoadEnvironment() -tests/utils.go # Test helpers, httpmock setup +configs/environment.go # Config struct, LoadEnvironment(), validateOperatorAuth (hard-fail + # when UI enabled without auth repo), per-provider LLM defaults cmd/ config-validator/ # CLI: validate configs, test patterns, init templates test-webhook/ # CLI: send test webhook payloads (with delivery ID) test-pem/ # CLI: verify PEM key + App ID against GitHub API + test-llm/ # CLI: smoke-test LLM provider end-to-end (Ping, ListModels, + # GenerateJSON with the real SuggestRuleSystemPrompt) scripts/ - ci-local.sh # Run full CI pipeline locally (build, test, lint, vet) + ci-local.sh # Run full CI pipeline locally run-local.sh # Run app locally with dev settings - deploy-cloudrun.sh # Deploy to Google Cloud Run + deploy-cloudrun.sh # Deploy to Google Cloud Run (manual fallback) integration-test.sh # End-to-end integration test - release.sh # Create versioned release (tag, changelog, GitHub Release) + release.sh # Create versioned release (tag, CHANGELOG, GitHub Release) test-slack.sh # Test Slack notification integration diagnose-github-auth.sh # Debug GitHub App authentication issues - check-installation-repos.sh # List repos accessible to GitHub App installation ``` ## Key Types ```go // types/config.go -type PatternType string // "prefix" | "glob" | "regex" -type TransformationType string // "move" | 
"copy" | "glob" | "regex" +type PatternType string // "prefix" | "glob" | "regex" +type TransformationType string // "move" | "copy" | "glob" | "regex" type Workflow struct { Name string - Source Source // Repo, Branch, InstallationID - Destination Destination // Repo, Branch - Transformations []Transformation // Type, From, To, Pattern, Replacement + Source Source // Repo, Branch, InstallationID + Destination Destination // Repo, Branch + Transformations []Transformation // Type, From, To, Pattern, Replacement Exclude []string - CommitStrategy *CommitStrategyConfig // Type (direct|pull_request), PRTitle, PRBody, AutoMerge + CommitStrategy *CommitStrategyConfig // Type, PRTitle, PRBody, AutoMerge DeprecationCheck *DeprecationConfig } -// types/types.go -type ChangedFile struct { Path, Status string } // Status: "ADDED"|"MODIFIED"|"DELETED" -type UploadKey struct { RepoName, BranchPath string } +// services/llm_client.go +type LLMClient interface { + GenerateJSON(ctx, system, user string) (string, error) + ProviderName() string + Ping(ctx) error + Get/SetBaseURL, Get/SetActiveModel + ListModels(ctx) ([]LLMModel, error) + PullModel(ctx, name, progressFn) error // ollama only + DeleteModel(ctx, name) error // ollama only +} +type LLMClientOptions struct { Provider, BaseURL, Model, APIKey string } + +// services/operator_auth.go +type OperatorRole string // "operator" | "writer" | "denied" +type ghAPIError struct { StatusCode int; Body string } // exposes IsTransient() ``` ## State Management -All mutable state is encapsulated in `TokenManager` (thread-safe via `sync.RWMutex`): -- Installation access token -- Per-org installation tokens with expiry -- Cached JWT -- HTTP client +- **Per-install tokens**: `TokenManager` (thread-safe via `sync.RWMutex`), cached JWT, HTTP client. +- **Per-request file state**: `FileStateService` on the `ServiceContainer`. +- **Webhook idempotency**: `DeliveryTracker` (TTL-based, in-memory). +- **PAT auth cache**: `ghAuthCache` (5-min TTL). 
Keys are **SHA-256 hashes** of the PAT — raw tokens never sit in the heap. Stores the full `*OperatorUser` and per-repo permission levels. +- **LLM settings**: process-global, in-memory, mutated at runtime via `/llm/settings`. Revert to env defaults on restart; the UI hint calls this out. +- **LLM ping cache**: 30s TTL; invalidated on `SetBaseURL` / `SetActiveModel`. +- **Rate limit buckets**: fixed-window (30/hour) on `/suggest-rule`, keyed by hashed PAT. Opportunistic eviction. +- **Log buffer**: context-tagged ring buffer (`ContextWithLogBuffer`) captures slog output per webhook delivery for the logs drawer. + +## Authorization Model (Operator UI) + +Each user signs in with their own GitHub PAT. Permission on `OPERATOR_AUTH_REPO` decides role: + +| GitHub permission | Role | Capabilities | +|-------------------|------------|-----------------------------------------------------------| +| `admin`, `maintain`| operator | All UI, replay, release, AI settings | +| `write`, `triage`, `read` | writer | View audit/workflows/copies, AI rule suggester | +| none | denied | 401 | -Per-request file state is managed via `FileStateService` in the `ServiceContainer`. +`write` is deliberately **not** operator — docs contributors typically have `write` on the auth repo and shouldn't get replay/release capability. -Webhook idempotency is handled by `DeliveryTracker` (TTL-based, in-memory). +Additional gate on replay: user's PAT must have read access to the **source repo of the webhook being replayed** (checked via `ghAuthCache.CanUserReadRepo`). + +Permission-check error handling: **5xx from GitHub is soft-failed to writer** (transient outage shouldn't lock everyone out); everything else (404, 401, 403, network, parse error) → `RoleDenied`. The distinction is carried by `ghAPIError.IsTransient()`. ## Target Repo Batching -Multiple workflows targeting the **same destination repo** are batched into a single commit/PR. 
-The last workflow's commit strategy, PR title/body, and auto-merge setting wins. -To get separate PRs per workflow, use different destination repos or branches. -See `docs/ARCHITECTURE.md` § "Target Repo Batching" for full details. +Multiple workflows targeting the same destination repo are batched into a single commit/PR. The last workflow's commit strategy, PR title/body, and auto-merge setting wins. See `docs/ARCHITECTURE.md` § "Target Repo Batching". ## Config Example @@ -97,7 +149,7 @@ workflows: source: { repo: "org/src", branch: "main", patterns: [{type: glob, pattern: "docs/**"}] } destination: { repo: "org/dest", branch: "main" } transformations: [{ type: move, from: "docs/", to: "public/" }] - commit_strategy: { type: pull_request, pr_title: "Sync docs" } # type: direct|pull_request + commit_strategy: { type: pull_request, pr_title: "Sync docs" } ``` ## Quick Reference @@ -110,74 +162,78 @@ make run # run with .env # Testing go test -race ./... # all tests with race detector -go test ./services/... -run TestWorkflow -v # specific test -make test # run all tests via Makefile +go test ./services/ -run TestValidateGitHubPAT -v # specific test -# Linting -golangci-lint run ./... # lint (config: .golangci.yml) -make lint # lint via Makefile +# Linting + security +golangci-lint run ./... # lint (.golangci.yml) +gosec ./... # security scanner; should be 0 issues # CI (local) ./scripts/ci-local.sh # full CI: build, test, lint, vet # Release -./scripts/release.sh v1.2.3 # create release (see below) -./scripts/release.sh v1.2.3 --dry-run # preview without changes +./scripts/release.sh v1.2.3 --dry-run # preview +./scripts/release.sh v1.2.3 # tag + push, triggers Cloud Run deploy + +# Operator UI smoke test +go build -o test-llm ./cmd/test-llm && ./test-llm -env .env.test ``` ## Release Process -Releases use semantic versioning (`vMAJOR.MINOR.PATCH`) and are automated via `scripts/release.sh`. 
- -**Prerequisites:** -- Clean working tree on `main` branch -- `gh` CLI authenticated -- `CHANGELOG.md` has content in `[Unreleased]` section - -**What the script does:** -1. Validates version format and prerequisites -2. Renames `[Unreleased]` → `[vX.Y.Z] - YYYY-MM-DD` in `CHANGELOG.md` -3. Adds fresh `[Unreleased]` section -4. Commits: `Release vX.Y.Z` -5. Creates annotated git tag -6. Pushes to origin (triggers CI deploy via `v*` tag) -7. Creates GitHub Release with changelog excerpt +Semantic versioning (`vMAJOR.MINOR.PATCH`) via `scripts/release.sh`. Prereqs: clean `main`, `gh` authed, `[Unreleased]` populated. The script promotes `[Unreleased]` to a dated heading, commits, tags, pushes — the tag push triggers the Cloud Run deploy in `.github/workflows/ci.yml`. See the `## Release` section of `README.md` for detail. -**Changelog convention:** -- Follow [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) format -- Sections: `Added`, `Changed`, `Fixed`, `Security`, `Deprecated`, `Removed` -- Add entries to `[Unreleased]` as you work; the release script promotes them +**Changelog**: Follow [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). Sections: `Added`, `Changed`, `Fixed`, `Security`, `Deprecated`, `Removed`. 
## Edit Patterns -| Task | Files to modify | -|------|-----------------| -| New transformation | `types/config.go` (TransformationType) → `workflow_processor.go` (processFileForWorkflow) | -| New pattern type | `types/config.go` (PatternType) → `pattern_matcher.go` | -| New config field | `types/config.go` (struct) → consumers in `workflow_processor.go` | -| Webhook logic | `webhook_handler_new.go` | -| Rate limit behavior | `rate_limit.go` | -| Auth flow | `github_auth.go` + `token_manager.go` | -| CLI tool | `cmd/<tool>/main.go` + `cmd/<tool>/README.md` | +| Task | Files to modify | +|----------------------------------------|-----------------| +| New transformation type | `types/config.go` (TransformationType) → `workflow_processor.go` (processFileForWorkflow) | +| New pattern type | `types/config.go` (PatternType) → `pattern_matcher.go` | +| New config field | `types/config.go` → consumers in `workflow_processor.go` | +| New env var | `configs/environment.go` (field + const + loader); update `docs/CONFIG-REFERENCE.md` | +| Webhook pipeline logic | `webhook_handler_new.go` → `workflow_processor.go` | +| Rate-limit behavior (GitHub API) | `rate_limit.go` | +| Auth flow (App) | `github_auth.go` + `token_manager.go` | +| Operator UI route | `operator_ui.go` (RegisterOperatorRoutes + handler) + `services/web/operator/index.html` | +| Operator UI auth / role | `operator_auth.go` (role mapping, `ghAPIError`, cache) | +| LLM provider | Implement `LLMClient` in new `llm_<provider>.go`; dispatch in `llm_client.go` `NewLLMClient` | +| LLM prompt change | `operator_suggest_rule.go` (`SuggestRuleSystemPrompt`); rerun `cmd/test-llm` to validate | +| AI suggester UI change | `services/web/operator/index.html` §§ `ai-settings` / `ai-suggester` | +| CLI tool | `cmd/<tool>/main.go` + `cmd/<tool>/README.md` | ## Conventions -- Return `error`, never `log.Fatal` -- Wrap errors: `fmt.Errorf("context: %w", err)` -- Use sentinel errors from `errors.go` where appropriate -- Nil-check GitHub API responses before 
dereference -- Use `log/slog` for all logging (never `log` or `fmt.Print` for operational output) -- Tests use `httpmock`; see `tests/utils.go` -- Always run tests with `-race` flag -- **Changelog**: Update `CHANGELOG.md` for all notable changes (follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)) +- Return `error`, never `log.Fatal`. Wrap with `fmt.Errorf("context: %w", err)`. +- Sentinel errors from `errors.go`; new sentinels go next to the function that owns them (e.g. `ErrModelManagementNotSupported` in `llm_client.go`, `errTreeUnchanged` in `github_write_to_target.go`). +- Nil-check GitHub API responses before dereferencing. +- All logging via `log/slog`. Never `log.*` or `fmt.Print*` for operational output. +- Tests use `httpmock` (see `tests/utils.go`) for webhook flow; `httptest.Server` with `githubAPIBaseURL` package var override for operator auth tests. +- Always run tests with `-race`. +- **gosec must stay clean.** New HTTP URLs go through `githubAPIBaseURL` (or the equivalent Anthropic base URL) with validated path components + `url.PathEscape`, not raw user input. Document each `#nosec` inline. +- **Secrets never get logged or embedded in paths.** Use `hashToken` when you need a stable identifier derived from a PAT. +- **CHANGELOG.md**: update `[Unreleased]` for all notable changes. + +## Security Posture (recap) + +Details that tripped previous reviews: + +- **Auth failure ≠ writer role**: only transient 5xx from the GitHub permission check keeps the default writer role. Every other failure → `RoleDenied`. +- **No raw PATs in heap beyond request scope**: `ghAuthCache` keys on `hashToken(pat)`. Memory dumps can't leak active tokens. +- **LLM cost cap**: `/suggest-rule` is 30/hour per hashed-PAT; `/llm/status` ping is cached 30s. +- **SSRF defense-in-depth**: all GitHub API paths validate owner/repo/branch against RE2 whitelists (`ghUsernameRe`, `ghRepoNameRe`, `ghBranchNameRe`) and use `url.PathEscape` before embedding. 
## Key Documentation | Doc | Purpose | |-----|---------| -| `docs/ARCHITECTURE.md` | System design, data flow, batching behavior | -| `docs/CONFIG-REFERENCE.md` | Full config schema and field reference | -| `docs/DEPLOYMENT.md` | Cloud Run deployment, secrets setup | -| `docs/TROUBLESHOOTING.md` | Common issues and debugging | -| `docs/LOCAL-TESTING.md` | Running and testing locally | -| `testdata/README.md` | Test fixtures and webhook payload examples | +| [`README.md`](README.md) | Feature overview, quick start, operator UI + AI suggester | +| [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) | System design, data flow, batching behavior | +| [`docs/CONFIG-REFERENCE.md`](docs/CONFIG-REFERENCE.md) | Full env-var + YAML schema reference | +| [`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) | Cloud Run deployment, Secret Manager setup | +| [`docs/LOCAL-TESTING.md`](docs/LOCAL-TESTING.md) | Running and testing locally (incl. operator UI) | +| [`docs/TROUBLESHOOTING.md`](docs/TROUBLESHOOTING.md) | Common issues and debugging | +| [`docs/FAQ.md`](docs/FAQ.md) | FAQ including operator UI / AI suggester | +| [`cmd/test-llm/README.md`](cmd/test-llm/README.md) | LLM provider smoke test | +| [`testdata/README.md`](testdata/README.md) | Test fixtures and webhook payload examples | diff --git a/CHANGELOG.md b/CHANGELOG.md index bb44838..c5ac2d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,35 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +### Added + +- **Operator UI — comprehensive writer + operator dashboard** at `/operator/` (`OPERATOR_UI_ENABLED=true`). Five tabs (Overview, Webhooks, Audit, Workflows, System), sticky status bar, dark mode, keyboard shortcuts, shareable URLs, and a writer/operator mode toggle persisted to localStorage. 
+- **GitHub PAT authentication** — users sign in with their personal access token; role is derived from their permission on `OPERATOR_AUTH_REPO` (admin/maintain → operator, write/triage/read → writer). Operator actions (replay, release, AI settings) require an explicit admin or maintain grant, since most writers have `write` on the auth repo. Replay additionally enforces read access on the source repo for that specific delivery. +- **AI rule suggester** — paste a source path and desired target state, receive a suggested workflow rule with self-verification via the in-process `PatternMatcher`. Two providers supported: + - **Anthropic (hosted)** — default for Cloud Run. API key loaded from Secret Manager via `ANTHROPIC_API_KEY_SECRET_NAME`. No infra required; operators switch between Haiku / Sonnet / Opus from the UI. + - **Ollama (local)** — for dev or self-hosted deployments. UI manages connection, model pulls, deletes, and active-model switching without a redeploy. +- **Writer-facing features** — workflow browser with per-rule coverage, PR lookup by URL, recent copies feed, file match tester (with clear button and Python-style `(?P<name>)` regex translation for in-browser use), PR timeline, and in-app help overlay. +- **Per-delivery log viewer** — context-tagged ring buffer captures logs per webhook delivery, surfaced in an audit drawer alongside the trace and outcome summary. +- **Audit event enrichment** — `processed_ok` traces now include destination repo(s), files matched / uploaded / failed, and commit SHA. +- **Startup banner** — Operator UI, auth repo, AI model, and AI base URL are now surfaced when the app boots (local and Cloud Run). + +### Changed + +- **MongoDB audit logging enabled in production** — the Cloud Run deploy previously forced `AUDIT_ENABLED=false`; it is now `true`, aligning with the v0.3.0 "enabled by default" change. 
+- **Operator auth hardened** — token-based auth (`OPERATOR_UI_TOKEN`) removed entirely; GitHub PAT is the only supported mechanism. `OPERATOR_UI_ENABLED=true` now requires `OPERATOR_AUTH_REPO` at config load (validated in `validateOperatorAuth`). +- **`createPullRequest` skipped for empty commits** — `commitFilesToBranch` now returns an `errTreeUnchanged` sentinel so `addFilesViaPR` no longer calls the GitHub PR API with an unchanged tree (previously 422'd). +- **MongoDB driver v2 ObjectID decoding** — audit reads set `ObjectIDAsHexString: true` to avoid "error decoding key `_id`" on queries. + +### Fixed + +- **gosec G107 / G704 SSRF findings** — GitHub API URL construction in `services/operator_auth.go` now validates path components against strict RE2-compatible whitelists (`ghUsernameRe`, `ghRepoNameRe`) and escapes them with `url.PathEscape` before request construction; `slack_notifier.go` `#nosec` annotation extended to cover `NewRequestWithContext`. +- **Keyboard-shortcut overlay wouldn't close** — `.help-bg[hidden]` now wins over the base `display:flex`. +- **File match tester returned no matches for Java files** — JavaScript `RegExp` does not support Python-style `(?P<name>)` named groups; the tester now rewrites `(?P<` → `(?<` before compilation. + +### Security + +- **Token auth removed** — the operator UI no longer accepts a shared bearer token; all access is per-user via GitHub PAT with repo-scoped permission checks. 
+ ## [v0.3.0] - 2026-04-14 ### Changed diff --git a/README.md b/README.md index 2a74c22..4ecfe41 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,13 @@ A GitHub app that automatically copies code examples and files from source repos - **Development Tools** - Dry-run mode, CLI validation, enhanced logging - **Thread-Safe** - Concurrent webhook processing with proper state management +### Operator UI +- **Web dashboard at `/operator/`** - Five-tab UI (Overview, Webhooks, Audit, Workflows, System) with dark mode, keyboard shortcuts, and shareable URLs +- **GitHub PAT authentication** - Users sign in with their personal access token; role is derived from their permission on a configured auth repo (`admin`/`maintain` → operator, `write`/`triage`/`read` → writer) +- **Per-repo replay authorization** - Replay requires the caller's PAT to have read access to the source repo of the webhook being replayed +- **Writer-facing tools** - Workflow browser, PR lookup, recent copies feed, file match tester, audit drawer, per-delivery log viewer +- **AI rule suggester** - Paste a source/target pair; get a generated copier rule self-verified against the in-process pattern matcher. Two providers: [Anthropic](https://www.anthropic.com/) (hosted, default in prod via the Grove Foundry APIM gateway) or [Ollama](https://ollama.com) (local, for dev) + ## 🚀 Quick Start ### Prerequisites @@ -385,6 +392,47 @@ Get performance metrics: curl http://localhost:8080/metrics ``` +## Operator UI + +The operator UI is a web dashboard served from `/operator/` for diagnosing webhook processing, replaying failed deliveries, browsing workflows, and generating copier rules with AI assistance. 
+ +### Enabling the UI + +Set the required env vars: + +```yaml +OPERATOR_UI_ENABLED: "true" +OPERATOR_AUTH_REPO: "your-org/some-repo" # user permissions here determine role +OPERATOR_REPO_SLUG: "your-org/some-repo" # optional; enables audit-row deep links +``` + +**Startup fails** if `OPERATOR_UI_ENABLED=true` without `OPERATOR_AUTH_REPO` — this prevents an accidentally-open operator UI. + +### Authentication and roles + +Each user authenticates with their own **GitHub Personal Access Token**. Paste the PAT into the sign-in prompt; the server checks the user's permission on `OPERATOR_AUTH_REPO` and assigns a role: + +| GitHub permission | Operator UI role | Can do | +|---|---|---| +| `admin` / `maintain` | **operator** | View everything; replay deliveries; cut release tags; change AI settings | +| `write` / `triage` / `read` | **writer** | View workflows, audit, recent copies, file match tester, AI rule suggester | +| None | **denied** | 401 Unauthorized | + +`write` maps to writer (not operator) so typical docs contributors with repo write access can't replay deliveries or cut releases — those need an explicit `admin` / `maintain` grant. + +On top of the role, **replay is repo-scoped**: the user's PAT must also have read access to the source repo of the webhook being replayed. + +### AI rule suggester + +The operator UI includes an LLM-backed helper that takes a source/target file pair and returns a generated copier workflow rule, self-verified against the in-process pattern matcher before display. + +Two providers are supported via `LLM_PROVIDER`: + +- **`anthropic`** (default in Cloud Run): calls the Anthropic Messages API. For MongoDB deployments this routes through the Grove Foundry APIM gateway — set `LLM_BASE_URL=https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic` and load the gateway key from Secret Manager via `ANTHROPIC_API_KEY_SECRET_NAME`. 
+- **`ollama`** (default for local dev): runs against a local Ollama instance at `http://localhost:11434`. Connect, pull models, and switch the active model from the UI's System → AI settings panel without a redeploy. + +Smoke-test the LLM provider end-to-end with [`cmd/test-llm`](cmd/test-llm/README.md). + ## Audit Logging When enabled, all operations are logged to MongoDB: @@ -598,4 +646,6 @@ See [DEPLOYMENT.md](./docs/DEPLOYMENT.md) for the complete deployment and rollba - **[Config Validator](cmd/config-validator/README.md)** - CLI tool for validating configs - **[Test Webhook](cmd/test-webhook/README.md)** - CLI tool for testing webhooks +- **[Test PEM](cmd/test-pem/README.md)** - CLI tool for verifying the GitHub App private key +- **[Test LLM](cmd/test-llm/README.md)** - CLI tool for smoke-testing the AI rule suggester's LLM provider - **[Scripts](scripts/README.md)** - Helper scripts for deployment, testing, and releases diff --git a/app.go b/app.go index 00c50b4..ac4d804 100644 --- a/app.go +++ b/app.go @@ -63,6 +63,15 @@ func main() { os.Exit(1) } + // Anthropic API key is only needed when the operator UI's AI suggester uses + // the anthropic provider. Failure to load is non-fatal — the UI will show + // "not configured" and writers can still use every other feature. 
+ if config.OperatorUIEnabled && config.LLMProvider == "anthropic" { + if err := services.LoadAnthropicAPIKey(ctx, config); err != nil { + fmt.Printf("⚠️ Anthropic API key not loaded: %v (AI suggester will be disabled)\n", err) + } + } + // Override dry-run from command line if dryRun { config.DryRun = true @@ -136,15 +145,35 @@ func printBanner(config *configs.Config, container *services.ServiceContainer) { fmt.Printf("║ Version: %-48s║\n", version) fmt.Printf("║ Port: %-48s║\n", config.Port) fmt.Printf("║ Webhook Path: %-48s║\n", config.WebserverPath) - fmt.Printf("║ Config File: %-48s║\n", config.EffectiveConfigFile()) + fmt.Printf("║ Config File: %-48s║\n", truncMiddle(config.EffectiveConfigFile(), 48)) fmt.Printf("║ Dry Run: %-48v║\n", config.DryRun) fmt.Printf("║ Audit Log: %-48v║\n", config.AuditEnabled) fmt.Printf("║ Metrics: %-48v║\n", config.MetricsEnabled) fmt.Printf("║ Slack: %-48v║\n", config.SlackEnabled) + fmt.Printf("║ Operator UI: %-48v║\n", config.OperatorUIEnabled) + if config.OperatorUIEnabled { + fmt.Printf("║ Auth Repo: %-48s║\n", truncMiddle(config.OperatorAuthRepo, 48)) + fmt.Printf("║ AI Provider:%-48s║\n", truncMiddle(config.LLMProvider, 48)) + fmt.Printf("║ AI Model: %-48s║\n", truncMiddle(config.LLMModel, 48)) + fmt.Printf("║ AI URL: %-48s║\n", truncMiddle(config.LLMBaseURL, 48)) + } fmt.Println("╚════════════════════════════════════════════════════════════════╝") fmt.Println() } +// truncMiddle shortens s to max bytes, replacing the middle with "..." when +// too long. Uses ASCII so Go's byte-count-based %-Ns padding stays aligned. +func truncMiddle(s string, max int) string { + if len(s) <= max { + return s + } + if max < 6 { + return s[:max] + } + keep := (max - 3) / 2 + return s[:keep] + "..." 
+ s[len(s)-(max-3-keep):] +} + func validateConfiguration(container *services.ServiceContainer) error { ctx := context.Background() _, err := container.ConfigLoader.LoadConfig(ctx, container.Config) @@ -155,24 +184,22 @@ func startWebServer(config *configs.Config, container *services.ServiceContainer // Create HTTP handler with all routes mux := http.NewServeMux() - // Webhook endpoint - mux.HandleFunc(config.WebserverPath, func(w http.ResponseWriter, r *http.Request) { - handleWebhook(w, r, config, container) - }) - - // Liveness probe — lightweight, always 200 if process is running + // Register built-in paths before the configurable webhook route so a mis-set + // WEBSERVER_PATH can never shadow /health, /ready, /metrics, /config, or /operator. mux.HandleFunc("/health", services.HealthHandler(container.StartTime, version)) - - // Readiness probe — checks GitHub auth, MongoDB connectivity mux.HandleFunc("/ready", services.ReadinessHandler(container)) - - // Metrics endpoint (if enabled) if config.MetricsEnabled { mux.HandleFunc("/metrics", services.MetricsHandler(container.MetricsCollector, container.FileStateService)) } - - // Config diagnostic endpoint — shows resolved config with secrets redacted mux.HandleFunc("/config", services.ConfigDiagnosticHandler(container, version)) + if config.OperatorUIEnabled { + services.RegisterOperatorRoutes(mux, config, container, version) + } + + // GitHub webhook (configurable path, typically /events) + mux.HandleFunc(config.WebserverPath, func(w http.ResponseWriter, r *http.Request) { + handleWebhook(w, r, config, container) + }) // Info endpoint mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { @@ -189,6 +216,9 @@ func startWebServer(config *configs.Config, container *services.ServiceContainer if config.MetricsEnabled { _, _ = fmt.Fprintf(w, "Metrics: /metrics\n") } + if config.OperatorUIEnabled { + _, _ = fmt.Fprintf(w, "Operator UI: /operator/ (authenticate with a GitHub PAT; role from %s)\n", 
config.OperatorAuthRepo) + } + }) // Create server diff --git a/cmd/test-llm/README.md b/cmd/test-llm/README.md new file mode 100644 index 0000000..648f709 --- /dev/null +++ b/cmd/test-llm/README.md @@ -0,0 +1,79 @@ +# test-llm + +Smoke-test the operator UI's LLM client against the configured provider. + +## Purpose + +Verify end-to-end that: + +- The provider URL and API key are reachable from your machine +- Auth headers are accepted (direct Anthropic API or APIM-fronted gateway) +- The active model responds to a real rule-suggester prompt and returns valid JSON + +Useful after rotating `ANTHROPIC_API_KEY`, changing `LLM_BASE_URL`, or pointing at a new gateway. + +## Build + +```bash +go build -o test-llm ./cmd/test-llm +``` + +## Usage + +```bash +./test-llm [-env <path>] [-timeout <duration>] +``` + +The tool reads standard env vars — `LLM_PROVIDER`, `LLM_BASE_URL`, `LLM_MODEL`, `ANTHROPIC_API_KEY` — from the process environment. Use `-env` to load a `.env`-style file first. Inline env vars on the command line override file values. + +## Examples + +Smoke-test against the local `.env.test`: + +```bash +./test-llm -env .env.test +``` + +Override the key without editing the env file: + +```bash +ANTHROPIC_API_KEY='sk-...' ./test-llm -env .env.test +``` + +Test Ollama locally: + +```bash +LLM_PROVIDER=ollama LLM_BASE_URL=http://localhost:11434 LLM_MODEL=qwen2.5-coder:7b ./test-llm +``` + +## Output + +On success: + +``` +Provider: anthropic +Base URL: https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic +Model: claude-haiku-4-5 +API key: sk-a…xyz9 + +✅ Ping OK +✅ ListModels: 3 models + - claude-opus-4-7 + - claude-sonnet-4-6 + - claude-haiku-4-5-20251001 +✅ GenerateJSON parsed OK: + { + "transform_type": "move", + "transform_from": "agg/python/models", + ... + } + +🎉 All checks passed — the LLM provider is reachable and usable.
+``` + +## Exit Codes + +| Code | Meaning | +|------|--------------------------------------| +| 0 | All checks passed | +| 1 | Any failure (auth, network, parsing) | diff --git a/cmd/test-llm/main.go b/cmd/test-llm/main.go new file mode 100644 index 0000000..bd0d492 --- /dev/null +++ b/cmd/test-llm/main.go @@ -0,0 +1,136 @@ +// test-llm exercises the operator UI's LLM client end-to-end against the +// configured provider. It loads LLM_PROVIDER / LLM_BASE_URL / LLM_MODEL / +// ANTHROPIC_API_KEY from the environment (or an env file via -env), calls +// Ping, ListModels, and a minimal GenerateJSON with the real rule-suggester +// prompt, and prints the result. Useful for verifying a gateway URL or +// rotated API key before deploying. +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "os" + "strings" + "time" + + "github.com/joho/godotenv" + + "github.com/grove-platform/github-copier/services" +) + +func main() { + envFile := flag.String("env", "", "Optional path to a .env file to load before running") + timeout := flag.Duration("timeout", 30*time.Second, "Per-call timeout") + flag.Parse() + + if *envFile != "" { + if err := godotenv.Load(*envFile); err != nil { + fmt.Fprintf(os.Stderr, "❌ failed to load %s: %v\n", *envFile, err) + os.Exit(1) + } + fmt.Printf("Loaded env file: %s\n", *envFile) + } + + provider := strings.ToLower(strings.TrimSpace(os.Getenv("LLM_PROVIDER"))) + if provider == "" { + provider = "ollama" + } + baseURL := os.Getenv("LLM_BASE_URL") + model := os.Getenv("LLM_MODEL") + apiKey := os.Getenv("ANTHROPIC_API_KEY") + + fmt.Printf("Provider: %s\n", provider) + fmt.Printf("Base URL: %s\n", defaultIfEmpty(baseURL, "(default)")) + fmt.Printf("Model: %s\n", defaultIfEmpty(model, "(default)")) + if provider == "anthropic" { + fmt.Printf("API key: %s\n", redact(apiKey)) + } + fmt.Println() + + client, err := services.NewLLMClient(services.LLMClientOptions{ + Provider: provider, + BaseURL: baseURL, + Model: model, + APIKey: apiKey, 
+ }) + if err != nil { + fmt.Fprintf(os.Stderr, "❌ NewLLMClient: %v\n", err) + os.Exit(1) + } + + // 1. Ping + ctx, cancel := context.WithTimeout(context.Background(), *timeout) + if err := client.Ping(ctx); err != nil { + cancel() + fmt.Fprintf(os.Stderr, "❌ Ping: %v\n", err) + os.Exit(1) + } + cancel() + fmt.Println("✅ Ping OK") + + // 2. ListModels + ctx, cancel = context.WithTimeout(context.Background(), *timeout) + models, err := client.ListModels(ctx) + cancel() + if err != nil { + fmt.Fprintf(os.Stderr, "⚠️ ListModels: %v\n", err) + } else { + fmt.Printf("✅ ListModels: %d models\n", len(models)) + for _, m := range models { + fmt.Printf(" - %s\n", m.Name) + } + } + + // 3. GenerateJSON using the real rule-suggester system prompt. Importing + // services.SuggestRuleSystemPrompt keeps the smoke test in lock-step with + // what writers hit via the UI — if the prompt changes, the smoke test + // covers the new behavior automatically. + systemPrompt := services.SuggestRuleSystemPrompt + userPrompt := `Generate a copier rule for this transformation: + +Source file: agg/python/models/user.py +Target file: shared/python/models/user.py +Target repo: org/shared-examples + +Return ONLY a JSON object with the fields documented above. No prose outside the JSON.` + + ctx, cancel = context.WithTimeout(context.Background(), *timeout) + raw, err := client.GenerateJSON(ctx, systemPrompt, userPrompt) + cancel() + if err != nil { + fmt.Fprintf(os.Stderr, "❌ GenerateJSON: %v\n", err) + os.Exit(1) + } + + // Pretty-print if the response parses as JSON; otherwise show raw. 
+ var pretty map[string]any + if jerr := json.Unmarshal([]byte(raw), &pretty); jerr == nil { + out, _ := json.MarshalIndent(pretty, " ", " ") + fmt.Printf("✅ GenerateJSON parsed OK:\n %s\n", out) + } else { + fmt.Printf("⚠️ GenerateJSON returned non-JSON (%v):\n%s\n", jerr, raw) + os.Exit(1) + } + + fmt.Println("\n🎉 All checks passed — the LLM provider is reachable and usable.") +} + +func defaultIfEmpty(s, def string) string { + if strings.TrimSpace(s) == "" { + return def + } + return s +} + +func redact(s string) string { + s = strings.TrimSpace(s) + if s == "" { + return "(not set)" + } + if len(s) <= 8 { + return "***" + } + return s[:4] + "…" + s[len(s)-4:] +} diff --git a/configs/environment.go b/configs/environment.go index f9f5b01..9223fad 100644 --- a/configs/environment.go +++ b/configs/environment.go @@ -69,6 +69,29 @@ type Config struct { // Webhook retry configuration WebhookMaxRetries int // max retry attempts for failed webhook processing WebhookRetryInitialDelay int // initial delay between retries in seconds (doubles each attempt) + + // Operator web UI — off unless OPERATOR_UI_ENABLED=true. Works with any HTTP + // origin (local dev, Cloud Run, etc.). Access is gated by GitHub PATs: + // each user authenticates with their personal token, and the role + // (operator or writer) is determined by their permission on OPERATOR_AUTH_REPO. + OperatorUIEnabled bool + OperatorAuthRepo string // "owner/repo" — user permissions here determine role (required when UI is enabled) + OperatorRepoSlug string // "owner/repo" for GitHub links in audit/trace rows (optional) + OperatorReleaseGitHubToken string // PAT with contents:write to create a version tag (optional) + OperatorReleaseTargetBranch string // branch SHA used when creating a tag (default main) + + // AI rule suggestion (optional) — LLM-powered rule generation in the operator UI. 
+ // The feature is available whenever the LLM provider is reachable at runtime; + // operators can change the active model and base URL from the UI without restart. + LLMProvider string // "ollama" (local) or "anthropic" (hosted) + LLMBaseURL string // initial default; overridable from the UI + LLMModel string // initial default; overridable from the UI + + // Anthropic API key — required when LLMProvider="anthropic". Loaded from + // Secret Manager via AnthropicAPIKeySecretName, or directly via + // ANTHROPIC_API_KEY for local dev. + AnthropicAPIKey string + AnthropicAPIKeySecretName string } const ( @@ -117,6 +140,16 @@ const ( WebhookProcessingTimeoutSeconds = "WEBHOOK_PROCESSING_TIMEOUT_SECONDS" WebhookMaxRetries = "WEBHOOK_MAX_RETRIES" WebhookRetryInitialDelay = "WEBHOOK_RETRY_INITIAL_DELAY" //nolint:gosec // env var name, not a credential + OperatorUIEnabled = "OPERATOR_UI_ENABLED" + OperatorAuthRepo = "OPERATOR_AUTH_REPO" // repo for GitHub PAT permission check + OperatorRepoSlug = "OPERATOR_REPO_SLUG" + OperatorReleaseGitHubToken = "OPERATOR_RELEASE_GITHUB_TOKEN" // #nosec G101 -- env var name + OperatorReleaseTargetBranch = "OPERATOR_RELEASE_TARGET_BRANCH" + LLMProvider = "LLM_PROVIDER" + LLMBaseURL = "LLM_BASE_URL" + LLMModel = "LLM_MODEL" + AnthropicAPIKey = "ANTHROPIC_API_KEY" // #nosec G101 -- env var name, not a credential + AnthropicAPIKeySecretName = "ANTHROPIC_API_KEY_SECRET_NAME" // #nosec G101 -- env var name, not a credential ) // NewConfig returns a new Config instance with default values @@ -235,6 +268,24 @@ func LoadEnvironment(envFile string) (*Config, error) { config.WebhookMaxRetries = getIntEnvWithDefault(WebhookMaxRetries, config.WebhookMaxRetries) config.WebhookRetryInitialDelay = getIntEnvWithDefault(WebhookRetryInitialDelay, config.WebhookRetryInitialDelay) + config.OperatorUIEnabled = getBoolEnvWithDefault(OperatorUIEnabled, false) + config.OperatorAuthRepo = os.Getenv(OperatorAuthRepo) + config.OperatorRepoSlug = 
os.Getenv(OperatorRepoSlug) + config.OperatorReleaseGitHubToken = os.Getenv(OperatorReleaseGitHubToken) + config.OperatorReleaseTargetBranch = getEnvWithDefault(OperatorReleaseTargetBranch, "main") + + config.LLMProvider = strings.ToLower(getEnvWithDefault(LLMProvider, "ollama")) + // Per-provider defaults: Ollama runs locally, Anthropic is hosted. + if config.LLMProvider == "anthropic" { + config.LLMBaseURL = getEnvWithDefault(LLMBaseURL, "https://api.anthropic.com") + config.LLMModel = getEnvWithDefault(LLMModel, "claude-haiku-4-5") + } else { + config.LLMBaseURL = getEnvWithDefault(LLMBaseURL, "http://localhost:11434") + config.LLMModel = getEnvWithDefault(LLMModel, "qwen2.5-coder:7b") + } + config.AnthropicAPIKey = os.Getenv(AnthropicAPIKey) + config.AnthropicAPIKeySecretName = os.Getenv(AnthropicAPIKeySecretName) + if err := validateConfig(config); err != nil { return nil, err } @@ -323,5 +374,53 @@ func validateConfig(config *Config) error { } } + if err := validateWebserverPath(config.WebserverPath); err != nil { + return err + } + + if err := validateOperatorAuth(config); err != nil { + return err + } + + return nil +} + +// validateOperatorAuth enforces that OPERATOR_AUTH_REPO is set when the UI is +// enabled. Without it, any valid GitHub user could authenticate with full +// operator access since there would be no per-repo permission gate. 
+func validateOperatorAuth(config *Config) error { + if !config.OperatorUIEnabled { + return nil + } + if strings.TrimSpace(config.OperatorAuthRepo) == "" { + return fmt.Errorf("OPERATOR_UI_ENABLED=true requires OPERATOR_AUTH_REPO (owner/repo) to gate access — each user authenticates with their GitHub PAT and their permission on that repo determines their role") + } + if !strings.Contains(config.OperatorAuthRepo, "/") { + return fmt.Errorf("OPERATOR_AUTH_REPO must be in owner/repo format (got %q)", config.OperatorAuthRepo) + } + return nil +} + +// validateWebserverPath rejects values that would collide with built-in HTTP routes. +func validateWebserverPath(p string) error { + p = strings.TrimSpace(p) + if p == "" { + return fmt.Errorf("WEBSERVER_PATH cannot be empty") + } + if !strings.HasPrefix(p, "/") { + return fmt.Errorf("WEBSERVER_PATH must start with / (got %q)", p) + } + if p == "/" { + return fmt.Errorf("WEBSERVER_PATH cannot be / (reserved; use a dedicated path such as /events)") + } + for _, reserved := range []string{"/health", "/ready", "/metrics", "/config", "/operator"} { + if strings.EqualFold(p, reserved) { + return fmt.Errorf("WEBSERVER_PATH cannot be %s (reserved for a built-in route)", reserved) + } + } + norm := strings.TrimSuffix(strings.ToLower(p), "/") + "/" + if strings.HasPrefix(norm, "/operator/") { + return fmt.Errorf("WEBSERVER_PATH cannot be under /operator/ (reserved for the operator UI)") + } return nil } diff --git a/docs/CONFIG-REFERENCE.md b/docs/CONFIG-REFERENCE.md index 41895fb..b08bf61 100644 --- a/docs/CONFIG-REFERENCE.md +++ b/docs/CONFIG-REFERENCE.md @@ -15,6 +15,8 @@ Complete reference for all github-copier configuration options: environment vari - [Audit Logging](#audit-logging) - [GitHub API Tuning](#github-api-tuning) - [Webhook Processing](#webhook-processing) + - [Operator UI](#operator-ui) + - [AI Rule Suggester (LLM)](#ai-rule-suggester-llm) - [Google Cloud](#google-cloud) - [Workflow YAML 
Schema](#workflow-yaml-schema) - [Main Config](#main-config) @@ -127,6 +129,32 @@ Set via `.env` files, `env-cloudrun.yaml`, or process environment. | `WEBHOOK_MAX_RETRIES` | int | `2` | Max retry attempts for failed webhook processing (total attempts = retries + 1). | | `WEBHOOK_RETRY_INITIAL_DELAY` | int | `5` | Initial delay between retries in **seconds** (doubles each attempt). | +### Operator UI + +Mount the web dashboard at `/operator/` (see the [Operator UI section of the README](../README.md#operator-ui) for access model, roles, and feature overview). Off unless `OPERATOR_UI_ENABLED=true`. + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `OPERATOR_UI_ENABLED` | bool | `false` | Enable the operator UI routes (`/operator/*`). | +| `OPERATOR_AUTH_REPO` | string | — | `owner/repo` — the user's permission on this repo determines their role (`admin`/`maintain` → operator, `write`/`triage`/`read` → writer). **Required** when the UI is enabled — startup fails otherwise. | +| `OPERATOR_REPO_SLUG` | string | — | `owner/repo` used to build clickable GitHub links in audit/trace rows. Optional. | +| `OPERATOR_RELEASE_GITHUB_TOKEN` | string | — | PAT with `contents:write` used by the UI to create version tag refs. Optional; without it the release button is hidden. | +| `OPERATOR_RELEASE_TARGET_BRANCH` | string | `main` | Branch whose HEAD SHA is tagged when cutting a release from the UI. | + +### AI Rule Suggester (LLM) + +Powers `/operator/api/suggest-rule`. The feature surface is always available when the operator UI is enabled; connectivity to the configured provider is checked at request time, and operators can switch model / base URL from the UI at runtime (process-global, reverts on restart). + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `LLM_PROVIDER` | string | `ollama` | Provider selector: `ollama` (local) or `anthropic` (hosted, default in Cloud Run). 
| +| `LLM_BASE_URL` | string | per-provider | Provider endpoint. Default `http://localhost:11434` for Ollama or `https://api.anthropic.com` for Anthropic. For MongoDB's Grove Foundry APIM gateway, use `https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic`. | +| `LLM_MODEL` | string | per-provider | Initial active model. Default `qwen2.5-coder:7b` for Ollama or `claude-haiku-4-5` for Anthropic. | +| `ANTHROPIC_API_KEY` | string | — | Anthropic API / gateway key. Loaded directly from the env for local dev. Ignored when `LLM_PROVIDER=ollama`. | +| `ANTHROPIC_API_KEY_SECRET_NAME` | string | — | GCP Secret Manager name for the Anthropic key; used in Cloud Run so no key material is ever in env vars or YAML. Short name (e.g. `anthropic-api-key`) is resolved to a full path via `SecretPath()`. | + +The suggester is rate-limited to 30 requests/hour per authenticated user (keyed by hashed PAT) to cap provider cost. Denied requests return 429 with a `Retry-After` header. + ### Google Cloud | Variable | Type | Default | Description | diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index 8235616..e846b39 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -164,6 +164,22 @@ echo -n "mongodb+srv://user:pass@cluster.mongodb.net/dbname" | \ --replication-policy="automatic" ``` +#### 4. Anthropic API Key (Optional - for the AI rule suggester) + +Required only when the operator UI is enabled and `LLM_PROVIDER=anthropic` (the default in the committed CI deploy). Skip if you're using Ollama or don't plan to use the AI rule suggester. + +```bash +# For the Grove Foundry APIM gateway, the value is the gateway key you were +# issued — not a raw Anthropic sk-... key. The app sends it as both the +# x-api-key (Anthropic) and api-key (APIM) header, so one key works either way. 
+echo -n "$GATEWAY_KEY" | \ + gcloud secrets create anthropic-api-key \ + --data-file=- \ + --replication-policy="automatic" +``` + +The env-var that points at this secret is `ANTHROPIC_API_KEY_SECRET_NAME=anthropic-api-key` (already set in `.github/workflows/ci.yml` and `env-cloudrun.yaml`). Missing key is non-fatal — the operator UI shows "not configured" and every other feature still works. + ### Grant Cloud Run Access ```bash @@ -185,6 +201,11 @@ gcloud secrets add-iam-policy-binding webhook-secret \ gcloud secrets add-iam-policy-binding mongo-uri \ --member="serviceAccount:${SERVICE_ACCOUNT}" \ --role="roles/secretmanager.secretAccessor" + +# Only if using the AI rule suggester with LLM_PROVIDER=anthropic +gcloud secrets add-iam-policy-binding anthropic-api-key \ + --member="serviceAccount:${SERVICE_ACCOUNT}" \ + --role="roles/secretmanager.secretAccessor" ``` **Note:** Cloud Run uses the default compute service account by default. You can also create a dedicated service account for better security isolation. 
@@ -322,11 +343,12 @@ services.LoadMongoURI(config) // Loads from Secret Manager
 
 ### Pre-Deployment Checklist
 
-- [ ] Secrets created in Secret Manager
-- [ ] IAM permissions granted to Cloud Run service account
+- [ ] Secrets created in Secret Manager (`CODE_COPIER_PEM`, `webhook-secret`, `mongo-uri`, and `anthropic-api-key` if using the AI rule suggester)
+- [ ] IAM permissions granted to Cloud Run service account on each secret
 - [ ] `env-cloudrun.yaml` created and configured
 - [ ] `env-cloudrun.yaml` in `.gitignore`
 - [ ] `Dockerfile` exists in project root
+- [ ] (Operator UI) `OPERATOR_AUTH_REPO` points at a repo you own and can manage collaborators on — its permission list decides who gets operator vs writer access
 
 ### Deploy to Cloud Run
 
@@ -480,6 +502,16 @@ gcloud run services logs read github-copier --limit=50
 
 # ❌ "webhook signature verification failed"
 ```
 
+### Smoke-Test the Operator UI (if enabled)
+
+Only applicable when `OPERATOR_UI_ENABLED=true`:
+
+1. Open `https://<your-service-url>/operator/` in a browser.
+2. Generate a GitHub PAT with `repo` scope, paste it into the sign-in prompt.
+3. Confirm the user chip in the header shows your GitHub avatar and the correct role (`operator` if you're `admin`/`maintain` on `OPERATOR_AUTH_REPO`, `writer` if you're `write`/`triage`/`read`).
+4. Click the **System** tab → **AI settings** → **Refresh status**. You should see the provider connected (e.g. "Anthropic connected at https://grove-gateway-prod.azure-api.net/…").
+5. If AI settings shows "unreachable", the `anthropic-api-key` secret wasn't granted to the Cloud Run service account, or the deploy is pointing at a URL the gateway doesn't accept. Check the Cloud Run revision logs for `Anthropic API key not loaded` or a 401/403 from the gateway.
+ ## Monitoring ### View Logs diff --git a/docs/FAQ.md b/docs/FAQ.md index ed1ee9b..989955c 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -30,6 +30,48 @@ The GitHub copier is a GitHub app that automatically copies code examples and fi - Health and metrics endpoints - Slack notifications - Dry-run mode for testing +- **[Operator UI](../README.md#operator-ui)** — Web dashboard at `/operator/` for replay, audit browsing, workflow inspection, and AI-assisted rule generation + +## Operator UI + +### What is the operator UI? + +A web dashboard served from `/operator/` when `OPERATOR_UI_ENABLED=true`. Five tabs: + +- **Overview** — live metrics, recent activity, health of dependent services +- **Webhooks** — recent webhook traces with filter/search and one-click replay +- **Audit** — searchable audit event history with a per-event drawer (trace + logs + replay) +- **Workflows** — browse the loaded copier config; test path matches with the built-in file match tester +- **System** — deployment metadata, AI settings, release tagging + +### Who can access the operator UI? + +Anyone with a GitHub PAT that has access to `OPERATOR_AUTH_REPO`. The user's permission on that repo determines their UI role: + +- `admin` / `maintain` → **operator**: full access including replay, release, AI settings +- `write` / `triage` / `read` → **writer**: view audit, workflows, recent copies, run the AI rule suggester and file match tester, but no replay / release +- No access → 401 Unauthorized + +`write` is deliberately mapped to **writer** (not operator) so typical docs contributors can't replay deliveries or cut releases just by having repo write access. Operator capability requires an explicit `admin` / `maintain` grant. + +### How does the AI rule suggester work? + +Paste a source file path and the target file path you want; optionally name the target repo. 
The server sends the pair plus a structured prompt to the configured LLM, parses the returned JSON, and runs the generated rule through the in-process pattern matcher to verify it actually produces your target from your source. If it doesn't match, the UI shows a "not verified" warning next to the YAML so you can review before copying it into your config. + +Two providers are supported: + +- **Anthropic** (default in Cloud Run) — calls the hosted Messages API. In this repo's deploy it routes through the Grove Foundry APIM gateway so no infrastructure needs to be stood up. +- **Ollama** (local dev) — runs against a local model server. The UI can pull models, switch the active one, and delete models without a redeploy. + +To cap cost, the suggester is rate-limited to 30 requests/hour per authenticated user. + +### The AI settings panel says "not connected" — how do I fix it? + +Check the banner at startup — it prints the active `AI Provider`, `AI Model`, and `AI URL`. Then: + +- **Anthropic**: make sure `ANTHROPIC_API_KEY` (local) or `ANTHROPIC_API_KEY_SECRET_NAME` (Cloud Run) is set. In Cloud Run, the runtime service account also needs `roles/secretmanager.secretAccessor` on the secret. +- **Ollama**: confirm `ollama serve` is running on the host at `LLM_BASE_URL` (default `http://localhost:11434`) and that you've pulled a model. +- Use [`cmd/test-llm`](../cmd/test-llm/README.md) to exercise the full path outside the UI — it reports Ping, ListModels, and a real GenerateJSON call. 
## Configuration diff --git a/docs/LOCAL-TESTING.md b/docs/LOCAL-TESTING.md index 3c74dca..1621d6e 100644 --- a/docs/LOCAL-TESTING.md +++ b/docs/LOCAL-TESTING.md @@ -343,6 +343,43 @@ AUDIT_DATABASE=code_copier_dev AUDIT_COLLECTION=audit_events ``` +### Optional (for Operator UI + AI rule suggester) + +```bash +# Mount the operator dashboard at http://localhost:8080/operator/ +OPERATOR_UI_ENABLED=true +OPERATOR_AUTH_REPO=your-org/some-repo # your GitHub permission here decides your UI role +OPERATOR_REPO_SLUG=your-org/some-repo # optional; enables clickable audit-row deep links + +# AI rule suggester — pick ONE provider: +# +# Option A: Ollama (local, no cloud calls, no API key needed) +# 1. Install Ollama: https://ollama.com/download +# 2. Leave LLM_PROVIDER unset — it defaults to ollama with http://localhost:11434 +# 3. From the UI's System → AI settings panel, pull a model (e.g. qwen2.5-coder:7b) +# +# Option B: Anthropic via Grove Foundry APIM gateway +LLM_PROVIDER=anthropic +LLM_BASE_URL=https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic +LLM_MODEL=claude-haiku-4-5 +ANTHROPIC_API_KEY= # never commit this; use a local-only env file +``` + +### Testing the Operator UI Locally + +1. Start the app with the env vars above. The startup banner will confirm `Operator UI: true` and show the configured auth repo, AI provider, model, and base URL. +2. Open `http://localhost:8080/operator/` in a browser. +3. Generate a [GitHub Personal Access Token](https://github.com/settings/tokens) with `repo` scope. Paste it into the sign-in prompt. The UI caches it in `localStorage` so you only paste once. +4. If you own `OPERATOR_AUTH_REPO`, grant yourself `admin` for the operator role, or `read`/`write` for the writer role — the header chip will show which one you got. +5. 
Smoke-test the LLM connection end-to-end with `cmd/test-llm` before hitting the UI:
+
+   ```bash
+   go build -o test-llm ./cmd/test-llm
+   ./test-llm -env .env.test
+   ```
+
+   A successful run pings the provider, lists models, and issues a real rule-suggester prompt. See [cmd/test-llm/README.md](../cmd/test-llm/README.md) for details.
+
 ## Troubleshooting
 
 ### Error: "A JSON web token could not be decoded" / "Failed to configure GitHub permissions"
diff --git a/env-cloudrun.yaml b/env-cloudrun.yaml
index 2829e5e..4a80ba0 100644
--- a/env-cloudrun.yaml
+++ b/env-cloudrun.yaml
@@ -33,3 +33,30 @@ COPIER_LOG_NAME: "code-copier-log"
 # Feature Flags
 AUDIT_ENABLED: "true"
 METRICS_ENABLED: "true"
+
+# Operator dashboard at https://<your-service-url>/operator/
+# Access is gated by each user's GitHub PAT: they authenticate with their
+# personal token, and their permission on OPERATOR_AUTH_REPO determines role
+# (admin/maintain → operator, write/triage/read → writer).
+OPERATOR_UI_ENABLED: "true"
+OPERATOR_AUTH_REPO: "grove-platform/github-copier"
+OPERATOR_REPO_SLUG: "grove-platform/github-copier"
+#
+# Optional: OPERATOR_RELEASE_GITHUB_TOKEN, OPERATOR_RELEASE_TARGET_BRANCH
+#
+# AI rule suggester — calls Anthropic via the Grove Foundry APIM gateway so
+# Cloud Run can reach it without standing up a model-serving VM. The gateway
+# key is loaded from Secret Manager via ANTHROPIC_API_KEY_SECRET_NAME (create
+# the secret once with `gcloud secrets create anthropic-api-key --data-file=...`).
+# Operators can still switch the active model (haiku / sonnet / opus) from
+# the UI without a redeploy.
+LLM_PROVIDER: "anthropic"
+LLM_BASE_URL: "https://grove-gateway-prod.azure-api.net/grove-foundry-prod/anthropic"
+LLM_MODEL: "claude-haiku-4-5"
+ANTHROPIC_API_KEY_SECRET_NAME: "anthropic-api-key"
+#
+# To use a local Ollama instance instead (e.g.
in a dev environment with a +# reachable Ollama VM), comment out the three lines above and use: +# LLM_PROVIDER: "ollama" +# LLM_BASE_URL: "http://ollama.internal:11434" +# LLM_MODEL: "qwen2.5-coder:7b" diff --git a/services/audit_logger.go b/services/audit_logger.go index 7fa6fca..0c264bb 100644 --- a/services/audit_logger.go +++ b/services/audit_logger.go @@ -21,21 +21,21 @@ const ( // AuditEvent represents an audit log entry type AuditEvent struct { - ID string `bson:"_id,omitempty"` - Timestamp time.Time `bson:"timestamp"` - EventType AuditEventType `bson:"event_type"` - RuleName string `bson:"rule_name,omitempty"` - SourceRepo string `bson:"source_repo"` - SourcePath string `bson:"source_path"` - TargetRepo string `bson:"target_repo,omitempty"` - TargetPath string `bson:"target_path,omitempty"` - CommitSHA string `bson:"commit_sha,omitempty"` - PRNumber int `bson:"pr_number,omitempty"` - Success bool `bson:"success"` - ErrorMessage string `bson:"error_message,omitempty"` - DurationMs int64 `bson:"duration_ms,omitempty"` - FileSize int64 `bson:"file_size,omitempty"` - AdditionalData map[string]any `bson:"additional_data,omitempty"` + ID string `bson:"_id,omitempty" json:"id,omitempty"` + Timestamp time.Time `bson:"timestamp" json:"timestamp"` + EventType AuditEventType `bson:"event_type" json:"event_type"` + RuleName string `bson:"rule_name,omitempty" json:"rule_name,omitempty"` + SourceRepo string `bson:"source_repo" json:"source_repo"` + SourcePath string `bson:"source_path" json:"source_path"` + TargetRepo string `bson:"target_repo,omitempty" json:"target_repo,omitempty"` + TargetPath string `bson:"target_path,omitempty" json:"target_path,omitempty"` + CommitSHA string `bson:"commit_sha,omitempty" json:"commit_sha,omitempty"` + PRNumber int `bson:"pr_number,omitempty" json:"pr_number,omitempty"` + Success bool `bson:"success" json:"success"` + ErrorMessage string `bson:"error_message,omitempty" json:"error_message,omitempty"` + DurationMs int64 
`bson:"duration_ms,omitempty" json:"duration_ms,omitempty"` + FileSize int64 `bson:"file_size,omitempty" json:"file_size,omitempty"` + AdditionalData map[string]any `bson:"additional_data,omitempty" json:"additional_data,omitempty"` } // AuditLogger handles audit logging to MongoDB @@ -48,25 +48,37 @@ type AuditLogger interface { GetEventsByRule(ctx context.Context, ruleName string, limit int) ([]AuditEvent, error) GetStatsByRule(ctx context.Context) (map[string]RuleStats, error) GetDailyVolume(ctx context.Context, days int) ([]DailyStats, error) + QueryAuditEvents(ctx context.Context, q AuditListQuery) ([]AuditEvent, error) Ping(ctx context.Context) error Close(ctx context.Context) error } +// AuditListQuery filters audit rows for operator dashboards and APIs. +type AuditListQuery struct { + Limit int + EventType string // empty = any; otherwise copy | deprecation | error + Success *bool // nil = any + RuleName string // exact match when non-empty + PRNumber *int // nil = any; exact match when set + PathSearch string // substring match on source_path OR target_path when non-empty + Since *time.Time // inclusive lower bound on timestamp when set +} + // RuleStats represents statistics for a specific rule type RuleStats struct { - RuleName string `bson:"_id"` - TotalCopies int `bson:"total_copies"` - SuccessCount int `bson:"success_count"` - FailureCount int `bson:"failure_count"` - AvgDuration float64 `bson:"avg_duration"` + RuleName string `bson:"_id" json:"rule_name"` + TotalCopies int `bson:"total_copies" json:"total_copies"` + SuccessCount int `bson:"success_count" json:"success_count"` + FailureCount int `bson:"failure_count" json:"failure_count"` + AvgDuration float64 `bson:"avg_duration" json:"avg_duration_ms"` } // DailyStats represents daily copy volume statistics type DailyStats struct { - Date string `bson:"_id"` - TotalCopies int `bson:"total_copies"` - SuccessCount int `bson:"success_count"` - FailureCount int `bson:"failure_count"` + Date string 
`bson:"_id" json:"date"` + TotalCopies int `bson:"total_copies" json:"total_copies"` + SuccessCount int `bson:"success_count" json:"success_count"` + FailureCount int `bson:"failure_count" json:"failure_count"` } // MongoAuditLogger implements AuditLogger using MongoDB @@ -92,7 +104,10 @@ func NewMongoAuditLogger(ctx context.Context, mongoURI, database, collection str SetConnectTimeout(5 * time.Second). SetTimeout(10 * time.Second). SetMaxPoolSize(10). - SetRetryWrites(true) + SetRetryWrites(true). + SetBSONOptions(&options.BSONOptions{ + ObjectIDAsHexString: true, + }) client, err := mongo.Connect(clientOptions) if err != nil { return nil, fmt.Errorf("failed to connect to MongoDB: %w", err) @@ -168,7 +183,51 @@ func (mal *MongoAuditLogger) LogErrorEvent(ctx context.Context, event *AuditEven return err } -// GetRecentEvents retrieves recent audit events +// QueryAuditEvents retrieves audit events matching the given filter criteria. +func (mal *MongoAuditLogger) QueryAuditEvents(ctx context.Context, q AuditListQuery) ([]AuditEvent, error) { + limit := q.Limit + if limit <= 0 { + limit = 50 + } + if limit > 200 { + limit = 200 + } + filter := bson.M{} + if q.EventType != "" { + filter["event_type"] = AuditEventType(q.EventType) + } + if q.Success != nil { + filter["success"] = *q.Success + } + if q.RuleName != "" { + filter["rule_name"] = q.RuleName + } + if q.PRNumber != nil { + filter["pr_number"] = *q.PRNumber + } + if q.PathSearch != "" { + filter["$or"] = bson.A{ + bson.M{"source_path": bson.M{"$regex": q.PathSearch, "$options": "i"}}, + bson.M{"target_path": bson.M{"$regex": q.PathSearch, "$options": "i"}}, + } + } + if q.Since != nil { + filter["timestamp"] = bson.M{"$gte": *q.Since} + } + opts := options.Find().SetSort(bson.D{{Key: "timestamp", Value: -1}}).SetLimit(int64(limit)) + cursor, err := mal.collection.Find(ctx, filter, opts) + if err != nil { + return nil, err + } + defer func() { _ = cursor.Close(ctx) }() + + var events []AuditEvent + if err := 
cursor.All(ctx, &events); err != nil { + return nil, err + } + return events, nil +} + func (mal *MongoAuditLogger) GetRecentEvents(ctx context.Context, limit int) ([]AuditEvent, error) { opts := options.Find().SetSort(bson.D{{Key: "timestamp", Value: -1}}).SetLimit(int64(limit)) cursor, err := mal.collection.Find(ctx, bson.M{}, opts) @@ -318,5 +377,8 @@ func (nal *NoOpAuditLogger) GetStatsByRule(ctx context.Context) (map[string]Rule func (nal *NoOpAuditLogger) GetDailyVolume(ctx context.Context, days int) ([]DailyStats, error) { return []DailyStats{}, nil } +func (nal *NoOpAuditLogger) QueryAuditEvents(ctx context.Context, q AuditListQuery) ([]AuditEvent, error) { + return []AuditEvent{}, nil +} func (nal *NoOpAuditLogger) Ping(ctx context.Context) error { return nil } func (nal *NoOpAuditLogger) Close(ctx context.Context) error { return nil } diff --git a/services/audit_logger_test.go b/services/audit_logger_test.go index 1f89afa..7a1a3eb 100644 --- a/services/audit_logger_test.go +++ b/services/audit_logger_test.go @@ -173,6 +173,18 @@ func TestNoOpAuditLogger_GetStatsByRule(t *testing.T) { } } +func TestNoOpAuditLogger_QueryAuditEvents(t *testing.T) { + ctx := context.Background() + logger := &NoOpAuditLogger{} + got, err := logger.QueryAuditEvents(ctx, AuditListQuery{Limit: 10, EventType: string(AuditEventCopy)}) + if err != nil { + t.Fatalf("QueryAuditEvents: %v", err) + } + if len(got) != 0 { + t.Errorf("expected empty slice, got %d events", len(got)) + } +} + func TestNoOpAuditLogger_GetDailyVolume(t *testing.T) { logger := &NoOpAuditLogger{} ctx := context.Background() diff --git a/services/delivery_tracker.go b/services/delivery_tracker.go index fe36de0..0f46944 100644 --- a/services/delivery_tracker.go +++ b/services/delivery_tracker.go @@ -5,6 +5,15 @@ import ( "time" ) +const deliveryHistoryMax = 200 + +// DeliverySnapshot is one observed webhook delivery ID for operator diagnostics. 
+type DeliverySnapshot struct { + DeliveryID string `json:"delivery_id"` + SeenAt time.Time `json:"seen_at"` + Duplicate bool `json:"duplicate"` +} + // DeliveryTracker tracks processed GitHub webhook delivery IDs to prevent // duplicate processing. GitHub retries deliveries on timeout or error, and // the X-GitHub-Delivery header uniquely identifies each delivery. @@ -16,6 +25,9 @@ type DeliveryTracker struct { entries map[string]time.Time ttl time.Duration + // history is a bounded ring of recent TryRecord outcomes (new vs duplicate) for diagnostics. + history []DeliverySnapshot + // stopCleanup signals the background goroutine to stop stopCleanup chan struct{} } @@ -26,6 +38,7 @@ func NewDeliveryTracker(ttl time.Duration) *DeliveryTracker { dt := &DeliveryTracker{ entries: make(map[string]time.Time), ttl: ttl, + history: make([]DeliverySnapshot, 0, 32), stopCleanup: make(chan struct{}), } go dt.cleanupLoop() @@ -40,15 +53,59 @@ func (dt *DeliveryTracker) TryRecord(deliveryID string) bool { if seenAt, exists := dt.entries[deliveryID]; exists { if time.Since(seenAt) < dt.ttl { + dt.appendHistoryLocked(deliveryID, true) return false // duplicate within TTL } // Expired entry — allow reprocessing } dt.entries[deliveryID] = time.Now() + dt.appendHistoryLocked(deliveryID, false) return true } +func (dt *DeliveryTracker) appendHistoryLocked(deliveryID string, duplicate bool) { + if deliveryID == "" { + return + } + dt.history = append(dt.history, DeliverySnapshot{ + DeliveryID: deliveryID, + SeenAt: time.Now().UTC(), + Duplicate: duplicate, + }) + if len(dt.history) > deliveryHistoryMax { + dt.history = dt.history[len(dt.history)-deliveryHistoryMax:] + } +} + +// HistoryLen returns how many recent delivery observations are buffered for diagnostics. +func (dt *DeliveryTracker) HistoryLen() int { + dt.mu.Lock() + defer dt.mu.Unlock() + return len(dt.history) +} + +// RecentDeliveries returns the last up to max observations (newest at end). 
+func (dt *DeliveryTracker) RecentDeliveries(max int) []DeliverySnapshot { + dt.mu.Lock() + defer dt.mu.Unlock() + if len(dt.history) == 0 { + return nil + } + if max <= 0 { + max = 100 + } + if max > deliveryHistoryMax { + max = deliveryHistoryMax + } + if len(dt.history) <= max { + out := make([]DeliverySnapshot, len(dt.history)) + copy(out, dt.history) + return out + } + return append([]DeliverySnapshot(nil), dt.history[len(dt.history)-max:]...) +} + // Len returns the current number of tracked delivery IDs (for diagnostics). func (dt *DeliveryTracker) Len() int { dt.mu.Lock() diff --git a/services/delivery_tracker_test.go b/services/delivery_tracker_test.go index 0ebb725..e3d7483 100644 --- a/services/delivery_tracker_test.go +++ b/services/delivery_tracker_test.go @@ -20,6 +20,13 @@ func TestDeliveryTracker_TryRecord(t *testing.T) { if dt.TryRecord("delivery-1") { t.Error("expected duplicate TryRecord to return false") } + hist := dt.RecentDeliveries(10) + if len(hist) < 2 { + t.Fatalf("expected history len >= 2, got %d", len(hist)) + } + if !hist[len(hist)-1].Duplicate { + t.Error("expected last history entry to be duplicate") + } // Different ID should succeed if !dt.TryRecord("delivery-2") { diff --git a/services/github_auth.go b/services/github_auth.go index 8317d0a..25d80e4 100644 --- a/services/github_auth.go +++ b/services/github_auth.go @@ -158,6 +158,26 @@ func LoadMongoURI(ctx context.Context, config *configs.Config) error { return nil } +// LoadAnthropicAPIKey loads the Anthropic API key from Secret Manager or +// environment variable. Only called when the LLM provider is "anthropic". +// Missing value is non-fatal: NewLLMClient will refuse to construct a client, +// the operator UI will show "not configured", and the rest of the app runs. 
+func LoadAnthropicAPIKey(ctx context.Context, config *configs.Config) error { + if config.AnthropicAPIKey != "" { + return nil + } + if config.AnthropicAPIKeySecretName == "" { + return nil + } + resolvedName := config.SecretPath(config.AnthropicAPIKeySecretName) + key, err := getSecretFromSecretManager(ctx, resolvedName, "ANTHROPIC_API_KEY") + if err != nil { + return fmt.Errorf("failed to load Anthropic API key: %w", err) + } + config.AnthropicAPIKey = key + return nil +} + // getSecretFromSecretManager is a generic function to retrieve any secret from Secret Manager func getSecretFromSecretManager(ctx context.Context, secretName, envVarName string) (string, error) { if os.Getenv("SKIP_SECRET_MANAGER") == "true" { diff --git a/services/github_write_to_target.go b/services/github_write_to_target.go index 2159c51..a88fb6b 100644 --- a/services/github_write_to_target.go +++ b/services/github_write_to_target.go @@ -2,6 +2,7 @@ package services import ( "context" + "errors" "fmt" "net/http" "strings" @@ -12,6 +13,10 @@ import ( "github.com/grove-platform/github-copier/types" ) +// errTreeUnchanged is returned by commitFilesToBranch when the new file tree is +// identical to the branch HEAD, meaning there is nothing to commit. +var errTreeUnchanged = errors.New("tree unchanged — nothing to commit") + // parseRepoPath parses a repository path in the format "owner/repo" and returns owner and repo separately. // If the path doesn't contain a slash, it returns defaultOwner and the path as repo name. func parseRepoPath(repoPath string, defaultOwner string) (owner, repo string) { @@ -54,7 +59,8 @@ func normalizeRefPath(branchPath string, fullPath bool) string { // AddFilesToTargetRepos uploads files to target repository branches. // It accepts the upload map as a parameter for concurrency safety. 
-func AddFilesToTargetRepos(ctx context.Context, config *configs.Config, filesToUpload map[types.UploadKey]types.UploadFileContent, prTemplateFetcher PRTemplateFetcher, metricsCollector *MetricsCollector) { +// When auditLogger is non-nil, each file copy is recorded (success or failure) for MongoDB audit. +func AddFilesToTargetRepos(ctx context.Context, config *configs.Config, filesToUpload map[types.UploadKey]types.UploadFileContent, prTemplateFetcher PRTemplateFetcher, metricsCollector *MetricsCollector, auditLogger AuditLogger) { if config.DryRun { for key, value := range filesToUpload { LogInfo("[DRY-RUN] Would upload files to target repo", @@ -71,11 +77,95 @@ func AddFilesToTargetRepos(ctx context.Context, config *configs.Config, filesToU } for key, value := range filesToUpload { + batchStart := time.Now() if err := uploadToTarget(ctx, config, key, value, prTemplateFetcher); err != nil { LogCritical("Failed to upload files", "repo", key.RepoName, "error", err) recordBatchFailure(metricsCollector, len(value.Content)) + auditLogCopyBatchFailure(ctx, auditLogger, key, value, err) + } else { + auditLogCopyBatchSuccess(ctx, auditLogger, key, value, time.Since(batchStart)) + } + } +} + +func auditLogCopyBatchSuccess(ctx context.Context, auditLogger AuditLogger, key types.UploadKey, value types.UploadFileContent, elapsed time.Duration) { + if auditLogger == nil || len(value.Content) == 0 { + return + } + n := len(value.Content) + perFileMs := elapsed.Milliseconds() / int64(n) + if perFileMs == 0 && elapsed > 0 { + perFileMs = 1 + } + for i := range value.Content { + f := value.Content[i] + meta := types.CopierFileMeta{} + if i < len(value.FileMeta) { + meta = value.FileMeta[i] + } + srcPath := meta.SourcePath + if srcPath == "" { + srcPath = f.GetPath() + } + ev := &AuditEvent{ + RuleName: meta.RuleName, + SourceRepo: meta.SourceRepo, + SourcePath: srcPath, + CommitSHA: meta.CommitSHA, + TargetRepo: key.RepoName, + TargetPath: f.GetName(), + PRNumber: meta.PRNumber, 
+ Success: true, + DurationMs: perFileMs, + FileSize: int64(decodedFileBytes(&f)), + } + if err := auditLogger.LogCopyEvent(ctx, ev); err != nil { + LogWarning("audit LogCopyEvent failed", "error", err) + } + } +} + +func auditLogCopyBatchFailure(ctx context.Context, auditLogger AuditLogger, key types.UploadKey, value types.UploadFileContent, batchErr error) { + if auditLogger == nil || len(value.Content) == 0 { + return + } + msg := batchErr.Error() + for i := range value.Content { + f := value.Content[i] + meta := types.CopierFileMeta{} + if i < len(value.FileMeta) { + meta = value.FileMeta[i] + } + srcPath := meta.SourcePath + if srcPath == "" { + srcPath = f.GetPath() } + ev := &AuditEvent{ + RuleName: meta.RuleName, + SourceRepo: meta.SourceRepo, + SourcePath: srcPath, + CommitSHA: meta.CommitSHA, + TargetRepo: key.RepoName, + TargetPath: f.GetName(), + PRNumber: meta.PRNumber, + Success: false, + ErrorMessage: msg, + } + if err := auditLogger.LogCopyEvent(ctx, ev); err != nil { + LogWarning("audit LogCopyEvent (failure) failed", "error", err) + } + } +} + +func decodedFileBytes(f *github.RepositoryContent) int { + if f == nil { + return 0 + } + c, err := f.GetContent() + if err != nil { + return 0 } + return len(c) } // uploadToTarget handles a single upload-key: authenticates for the target org, @@ -230,6 +320,11 @@ func addFilesViaPR(ctx context.Context, config *configs.Config, client *github.C // Push new files to the existing branch if err := commitFilesToBranch(ctx, config, client, key, files, existingBranch, commitMessage); err != nil { + if errors.Is(err, errTreeUnchanged) { + LogInfo("No changes to push to existing copier PR — files already up to date", + "pr_number", existingPR.GetNumber(), "repo", key.RepoName) + return nil + } return fmt.Errorf("commit to existing copier branch %s: %w", existingBranch, err) } @@ -259,6 +354,13 @@ func addFilesViaPR(ctx context.Context, config *configs.Config, client *github.C // 2. 
Commit files to temp branch if err := commitFilesToBranch(ctx, config, client, key, files, tempBranch, commitMessage); err != nil { + if errors.Is(err, errTreeUnchanged) { + LogInfo("No changes to commit — files already match target. Cleaning up temp branch.", + "repo", key.RepoName, "branch", tempBranch) + // Best-effort cleanup of the empty branch + _, _ = client.Git.DeleteRef(ctx, owner, repoName, "refs/heads/"+tempBranch) + return nil + } return err } @@ -305,7 +407,7 @@ func commitFilesToBranch(ctx context.Context, config *configs.Config, client *gi "branch", tempBranch, "tree_sha", tr.TreeSHA, ) - return nil + return errTreeUnchanged } if err = createCommit(ctx, client, config.ConfigRepoOwner, tempKey, tr.BaseSHA, tr.TreeSHA, commitMessage); err != nil { diff --git a/services/github_write_to_target_test.go b/services/github_write_to_target_test.go index 05e1383..e61221f 100644 --- a/services/github_write_to_target_test.go +++ b/services/github_write_to_target_test.go @@ -83,7 +83,7 @@ func TestAddFilesToTargetRepos_Direct_Succeeds(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() require.Equal(t, 1, info["GET "+baseRefURL]) @@ -173,7 +173,7 @@ func TestAddFilesToTargetRepos_ViaPR_Succeeds(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil, nil) require.Equal(t, 1, test.CountByMethodAndURLRegexp("POST", regexp.MustCompile(`/app/installations/`+regexp.QuoteMeta(cfg.InstallationId)+`/access_tokens$`), @@ -244,7 +244,7 @@ func TestAddFilesToTargetRepos_Direct_SkipsEmptyCommit(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil) + 
services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() // Should still fetch the ref and create the tree @@ -284,7 +284,7 @@ func TestAddFiles_DirectConflict_NonFastForward(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), test.TestConfig(), filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() require.Equal(t, 1, info["GET "+baseRefURL]) @@ -364,7 +364,7 @@ func TestAddFiles_ViaPR_MergeConflict_Dirty_NotMerged(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() require.Equal(t, 1, info["POST "+createRefURL]) @@ -417,7 +417,7 @@ func TestPriority_Strategy_ConfigOverridesEnv_And_MessageFallbacks(t *testing.T) {RepoName: repo, BranchPath: "refs/heads/" + baseBranch, CommitStrategy: typeCfg.CopierCommitStrategy}: {TargetBranch: baseBranch, Content: files}, } - services.AddFilesToTargetRepos(context.Background(), testCfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), testCfg, filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() require.Equal(t, 1, info["GET "+baseRefURL]) @@ -495,7 +495,7 @@ func TestPriority_PRTitleDefaultsToCommitMessage_And_NoAutoMergeWhenConfigPresen {RepoName: repo, BranchPath: "refs/heads/" + baseBranch, RuleName: "", CommitStrategy: "pr"}: {TargetBranch: baseBranch, Content: files, CommitStrategy: "pr"}, } - services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil, nil) require.Equal(t, 1, test.CountByMethodAndURLRegexp("POST", regexp.MustCompile(`/pulls$`))) require.Equal(t, 0, 
test.CountByMethodAndURLRegexp("PUT", regexp.MustCompile(`/pulls/5/merge$`))) @@ -564,7 +564,7 @@ func TestAddFilesToTargetRepos_MixedStrategies_ProducesSeparateOperations(t *tes }, } - services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() @@ -647,7 +647,7 @@ func TestAddFilesViaPR_ReusesExistingCopierPR(t *testing.T) { }, } - services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil) + services.AddFilesToTargetRepos(context.Background(), cfg, filesToUpload, nil, nil, nil) info := httpmock.GetCallCountInfo() diff --git a/services/llm_anthropic.go b/services/llm_anthropic.go new file mode 100644 index 0000000..ad8a609 --- /dev/null +++ b/services/llm_anthropic.go @@ -0,0 +1,282 @@ +package services + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "sync" + "time" +) + +// anthropicAPIVersion is pinned to the stable Messages API version. Bump this +// only when we intentionally adopt a new API contract. +const anthropicAPIVersion = "2023-06-01" + +// defaultAnthropicBaseURL is the hosted Anthropic API. Override (via SetBaseURL +// or LLM_BASE_URL) only to route through a gateway or proxy that speaks the +// same wire format. +const defaultAnthropicBaseURL = "https://api.anthropic.com" + +// anthropicFallbackModels is used when /v1/models returns an empty list or +// errors (e.g. behind a gateway that doesn't expose it). Kept deliberately +// minimal — listing every model here means every rotation ships dead +// dropdown options. Aliased names route to the current dated release, so +// this single entry stays valid across point releases. 
+var anthropicFallbackModels = []LLMModel{
+	{Name: "claude-haiku-4-5"},
+}
+
+type anthropicClient struct {
+	mu      sync.RWMutex
+	baseURL string
+	model   string
+	apiKey  string
+	http    *http.Client
+}
+
+func newAnthropicClient(baseURL, model, apiKey string) *anthropicClient {
+	if strings.TrimSpace(baseURL) == "" {
+		baseURL = defaultAnthropicBaseURL
+	}
+	if strings.TrimSpace(model) == "" {
+		model = "claude-haiku-4-5"
+	}
+	return &anthropicClient{
+		baseURL: strings.TrimSuffix(baseURL, "/"),
+		model:   model,
+		apiKey:  apiKey,
+		http:    &http.Client{Timeout: 60 * time.Second},
+	}
+}
+
+func (c *anthropicClient) ProviderName() string { return "anthropic" }
+
+func (c *anthropicClient) GetBaseURL() string {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.baseURL
+}
+
+func (c *anthropicClient) SetBaseURL(url string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.baseURL = strings.TrimSuffix(strings.TrimSpace(url), "/")
+}
+
+func (c *anthropicClient) GetActiveModel() string {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.model
+}
+
+func (c *anthropicClient) SetActiveModel(model string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.model = strings.TrimSpace(model)
+}
+
+// newAuthedRequest builds a request with the Anthropic auth + version headers.
+// Callers must have already validated that URL components are not user-supplied;
+// the base URL is derived from a pinned default or operator-set value.
+//
+// We set both x-api-key (native Anthropic) and api-key (Azure API Management
+// gateway convention) so the same client works when LLM_BASE_URL points at
+// either the direct API or an APIM-fronted proxy. Sending both is harmless —
+// the target service uses whichever it recognizes and ignores the other.
+func (c *anthropicClient) newAuthedRequest(ctx context.Context, method, path string, body io.Reader) (*http.Request, error) {
+	req, err := http.NewRequestWithContext(ctx, method, c.GetBaseURL()+path, body) // #nosec G107 -- base URL is pinned default or operator-set; path is a literal constant
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("x-api-key", c.apiKey)
+	req.Header.Set("api-key", c.apiKey)
+	req.Header.Set("anthropic-version", anthropicAPIVersion)
+	req.Header.Set("content-type", "application/json")
+	return req, nil
+}
+
+// Ping issues a minimal /v1/messages call as an auth + reachability check.
+// Using /v1/messages (rather than /v1/models) keeps this working behind
+// proxies — including Azure APIM-fronted gateways — that only expose the
+// messages endpoint. Cost per ping is roughly 1 input + 1 output token.
+func (c *anthropicClient) Ping(ctx context.Context) error {
+	if strings.TrimSpace(c.apiKey) == "" {
+		return fmt.Errorf("ANTHROPIC_API_KEY is not configured")
+	}
+	body, _ := json.Marshal(anthropicMessagesRequest{
+		Model:     c.GetActiveModel(),
+		MaxTokens: 1,
+		Messages:  []anthropicMessage{{Role: "user", Content: "ping"}},
+	})
+	req, err := c.newAuthedRequest(ctx, http.MethodPost, "/v1/messages", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	resp, err := c.http.Do(req) // #nosec G107 -- see newAuthedRequest
+	if err != nil {
+		return fmt.Errorf("anthropic unreachable at %s: %w", c.GetBaseURL(), err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
+		return fmt.Errorf("anthropic auth failed (HTTP %d) — check ANTHROPIC_API_KEY", resp.StatusCode)
+	}
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<15))
+		return fmt.Errorf("anthropic returned %s: %s", resp.Status, strings.TrimSpace(string(respBody)))
+	}
+	return nil
+}
+
+type anthropicModelsResponse struct {
+	Data []struct {
+		ID          string `json:"id"`
+		DisplayName string `json:"display_name"`
+		CreatedAt   string `json:"created_at"`
+	} `json:"data"`
+}
+
+// ListModels returns models available to the account. Falls back to a static
+// list if the API call fails so the UI stays usable.
+func (c *anthropicClient) ListModels(ctx context.Context) ([]LLMModel, error) {
+	req, err := c.newAuthedRequest(ctx, http.MethodGet, "/v1/models", nil)
+	if err != nil {
+		return nil, err
+	}
+	resp, err := c.http.Do(req) // #nosec G107 -- see newAuthedRequest
+	if err != nil {
+		return anthropicFallbackModels, nil
+	}
+	defer func() { _ = resp.Body.Close() }()
+	body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+	if resp.StatusCode != http.StatusOK {
+		return anthropicFallbackModels, nil
+	}
+	var out anthropicModelsResponse
+	if err := json.Unmarshal(body, &out); err != nil {
+		return anthropicFallbackModels, nil
+	}
+	if len(out.Data) == 0 {
+		return anthropicFallbackModels, nil
+	}
+	models := make([]LLMModel, 0, len(out.Data))
+	for _, m := range out.Data {
+		models = append(models, LLMModel{Name: m.ID, ModifiedAt: m.CreatedAt})
+	}
+	return models, nil
+}
+
+// PullModel / DeleteModel are not supported for hosted providers. The UI
+// hides the relevant sections when provider != "ollama", so these should not
+// normally be reached; returning a sentinel lets the HTTP layer map cleanly.
+func (c *anthropicClient) PullModel(_ context.Context, _ string, _ func(LLMPullProgress)) error {
+	return ErrModelManagementNotSupported
+}
+
+func (c *anthropicClient) DeleteModel(_ context.Context, _ string) error {
+	return ErrModelManagementNotSupported
+}
+
+// anthropicMessagesRequest is the body of POST /v1/messages.
+type anthropicMessagesRequest struct {
+	Model     string             `json:"model"`
+	MaxTokens int                `json:"max_tokens"`
+	System    string             `json:"system,omitempty"`
+	Messages  []anthropicMessage `json:"messages"`
+}
+
+type anthropicMessage struct {
+	Role    string `json:"role"`
+	Content string `json:"content"`
+}
+
+type anthropicMessagesResponse struct {
+	Content []struct {
+		Type string `json:"type"`
+		Text string `json:"text"`
+	} `json:"content"`
+	StopReason string `json:"stop_reason"`
+	Error      *struct {
+		Type    string `json:"type"`
+		Message string `json:"message"`
+	} `json:"error,omitempty"`
+}
+
+// jsonGuardrail is appended to the system prompt to nudge the model toward
+// raw JSON output. Anthropic has no native JSON mode on /v1/messages, so we
+// rely on prompting + a post-processing fence strip.
+const jsonGuardrail = "\n\nRespond with ONLY valid JSON — no prose, no explanations outside the JSON, no code fences, no backticks. Just the JSON object."
+
+func (c *anthropicClient) GenerateJSON(ctx context.Context, systemPrompt, userPrompt string) (string, error) {
+	if strings.TrimSpace(c.apiKey) == "" {
+		return "", fmt.Errorf("ANTHROPIC_API_KEY is not configured")
+	}
+	reqBody, err := json.Marshal(anthropicMessagesRequest{
+		Model:     c.GetActiveModel(),
+		MaxTokens: 4096,
+		System:    systemPrompt + jsonGuardrail,
+		Messages:  []anthropicMessage{{Role: "user", Content: userPrompt}},
+	})
+	if err != nil {
+		return "", fmt.Errorf("marshal anthropic request: %w", err)
+	}
+
+	req, err := c.newAuthedRequest(ctx, http.MethodPost, "/v1/messages", bytes.NewReader(reqBody))
+	if err != nil {
+		return "", err
+	}
+	resp, err := c.http.Do(req) // #nosec G107 -- see newAuthedRequest
+	if err != nil {
+		return "", fmt.Errorf("call anthropic at %s: %w", c.GetBaseURL(), err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("anthropic returned %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+
+	var out anthropicMessagesResponse
+	if err := json.Unmarshal(body, &out); err != nil {
+		return "", fmt.Errorf("parse anthropic response: %w", err)
+	}
+	if out.Error != nil {
+		return "", fmt.Errorf("anthropic error: %s: %s", out.Error.Type, out.Error.Message)
+	}
+	// Concatenate all text blocks (usually one).
+	var sb strings.Builder
+	for _, block := range out.Content {
+		if block.Type == "text" {
+			sb.WriteString(block.Text)
+		}
+	}
+	raw := strings.TrimSpace(sb.String())
+	if raw == "" {
+		return "", fmt.Errorf("anthropic returned empty response (model %q)", c.GetActiveModel())
+	}
+	return stripJSONFences(raw), nil
+}
+
+// stripJSONFences removes ```json ... ``` or ``` ... ``` wrappers that models
+// sometimes add despite being asked for raw JSON. If the input doesn't look
+// fenced, it's returned unchanged.
+func stripJSONFences(s string) string {
+	t := strings.TrimSpace(s)
+	if !strings.HasPrefix(t, "```") {
+		return t
+	}
+	// Drop the opening fence (```json or ```)
+	if nl := strings.IndexByte(t, '\n'); nl >= 0 {
+		t = t[nl+1:]
+	} else {
+		return strings.TrimSpace(strings.TrimPrefix(t, "```"))
+	}
+	// Drop the trailing fence
+	if idx := strings.LastIndex(t, "```"); idx >= 0 {
+		t = t[:idx]
+	}
+	return strings.TrimSpace(t)
+}
diff --git a/services/llm_client.go b/services/llm_client.go
new file mode 100644
index 0000000..b928f6f
--- /dev/null
+++ b/services/llm_client.go
@@ -0,0 +1,319 @@
+package services
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// ErrModelManagementNotSupported is returned by providers (e.g. Anthropic) where
+// model pulls/deletes don't apply. Handlers should map this to a 400-class
+// response rather than a 502, since it's a client-intent error, not a backend
+// failure.
+var ErrModelManagementNotSupported = errors.New("model management not supported for this provider")
+
+// LLMClient is the minimal interface used by the operator UI. It supports
+// runtime reconfiguration (active model, base URL) and provider management
+// operations (list/pull/delete models).
+type LLMClient interface {
+	// GenerateJSON sends a prompt to the LLM and returns the raw response body.
+	GenerateJSON(ctx context.Context, systemPrompt, userPrompt string) (string, error)
+
+	// ProviderName returns a short identifier for logging.
+	ProviderName() string
+
+	// Ping checks whether the LLM service is reachable.
+	Ping(ctx context.Context) error
+
+	// GetBaseURL returns the current base URL.
+	GetBaseURL() string
+	// SetBaseURL updates the base URL at runtime.
+	SetBaseURL(url string)
+
+	// GetActiveModel returns the model that will be used for generations.
+	GetActiveModel() string
+	// SetActiveModel updates the active model at runtime.
+	SetActiveModel(model string)
+
+	// ListModels returns the models installed/available on the LLM server.
+	ListModels(ctx context.Context) ([]LLMModel, error)
+
+	// PullModel asks the server to download a model. Progress updates are
+	// written to progress as they arrive. The function blocks until the pull
+	// completes or the context is cancelled.
+	PullModel(ctx context.Context, name string, progress func(LLMPullProgress)) error
+
+	// DeleteModel removes a model from the server.
+	DeleteModel(ctx context.Context, name string) error
+}
+
+// LLMModel describes an installed model returned by the provider.
+type LLMModel struct {
+	Name       string `json:"name"`
+	Size       int64  `json:"size,omitempty"`
+	ModifiedAt string `json:"modified_at,omitempty"`
+}
+
+// LLMPullProgress is a single progress event emitted during PullModel.
+type LLMPullProgress struct {
+	Status    string `json:"status"`
+	Completed int64  `json:"completed,omitempty"`
+	Total     int64  `json:"total,omitempty"`
+	Digest    string `json:"digest,omitempty"`
+	Error     string `json:"error,omitempty"`
+}
+
+// LLMClientOptions carries the per-provider settings NewLLMClient needs.
+// APIKey is required for hosted providers (anthropic); ignored by ollama.
+type LLMClientOptions struct {
+	Provider string
+	BaseURL  string
+	Model    string
+	APIKey   string
+}
+
+// NewLLMClient returns a client for the configured provider.
+func NewLLMClient(opts LLMClientOptions) (LLMClient, error) {
+	switch strings.ToLower(strings.TrimSpace(opts.Provider)) {
+	case "", "ollama":
+		baseURL := opts.BaseURL
+		if baseURL == "" {
+			baseURL = "http://localhost:11434"
+		}
+		model := opts.Model
+		if model == "" {
+			model = "qwen2.5-coder:7b"
+		}
+		return &ollamaClient{
+			baseURL: strings.TrimSuffix(baseURL, "/"),
+			model:   model,
+			http:    &http.Client{Timeout: 60 * time.Second},
+			pullHTTP: &http.Client{
+				// No timeout for pulls — model downloads can take 10+ minutes
+			},
+		}, nil
+	case "anthropic":
+		if strings.TrimSpace(opts.APIKey) == "" {
+			return nil, fmt.Errorf("anthropic provider requires ANTHROPIC_API_KEY")
+		}
+		return newAnthropicClient(opts.BaseURL, opts.Model, opts.APIKey), nil
+	default:
+		return nil, fmt.Errorf("unsupported LLM provider: %q (expected \"ollama\" or \"anthropic\")", opts.Provider)
+	}
+}
+
+// ── Ollama ──
+
+type ollamaClient struct {
+	mu       sync.RWMutex
+	baseURL  string
+	model    string
+	http     *http.Client // short-timeout client for most calls
+	pullHTTP *http.Client // no-timeout client for streaming pull requests
+}
+
+func (c *ollamaClient) ProviderName() string { return "ollama" }
+
+func (c *ollamaClient) GetBaseURL() string {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.baseURL
+}
+
+func (c *ollamaClient) SetBaseURL(url string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.baseURL = strings.TrimSuffix(strings.TrimSpace(url), "/")
+}
+
+func (c *ollamaClient) GetActiveModel() string {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.model
+}
+
+func (c *ollamaClient) SetActiveModel(model string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.model = strings.TrimSpace(model)
+}
+
+// Ping calls GET /api/tags as a reachability check (cheap, no model load).
+func (c *ollamaClient) Ping(ctx context.Context) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.GetBaseURL()+"/api/tags", nil)
+	if err != nil {
+		return err
+	}
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("ollama unreachable at %s: %w", c.GetBaseURL(), err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode != http.StatusOK {
+		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<15))
+		return fmt.Errorf("ollama returned %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+	return nil
+}
+
+// ollamaTagsResponse is GET /api/tags.
+type ollamaTagsResponse struct {
+	Models []struct {
+		Name       string `json:"name"`
+		Size       int64  `json:"size"`
+		ModifiedAt string `json:"modified_at"`
+	} `json:"models"`
+}
+
+func (c *ollamaClient) ListModels(ctx context.Context) ([]LLMModel, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.GetBaseURL()+"/api/tags", nil)
+	if err != nil {
+		return nil, err
+	}
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("list models: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("ollama returned %s: %s", resp.Status, strings.TrimSpace(string(body)))
+	}
+	var tags ollamaTagsResponse
+	if err := json.Unmarshal(body, &tags); err != nil {
+		return nil, fmt.Errorf("parse models: %w", err)
+	}
+	out := make([]LLMModel, 0, len(tags.Models))
+	for _, m := range tags.Models {
+		out = append(out, LLMModel{Name: m.Name, Size: m.Size, ModifiedAt: m.ModifiedAt})
+	}
+	return out, nil
+}
+
+// PullModel starts a model pull and streams NDJSON progress events.
+func (c *ollamaClient) PullModel(ctx context.Context, name string, progress func(LLMPullProgress)) error {
+	body, _ := json.Marshal(map[string]any{"name": name, "stream": true})
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.GetBaseURL()+"/api/pull", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := c.pullHTTP.Do(req)
+	if err != nil {
+		return fmt.Errorf("start pull: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<15))
+		return fmt.Errorf("ollama returned %s: %s", resp.Status, strings.TrimSpace(string(respBody)))
+	}
+
+	// Ollama emits newline-delimited JSON progress events. Stream them through.
+	scanner := bufio.NewScanner(resp.Body)
+	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
+	for scanner.Scan() {
+		line := scanner.Bytes()
+		if len(line) == 0 {
+			continue
+		}
+		var ev LLMPullProgress
+		if err := json.Unmarshal(line, &ev); err != nil {
+			// Skip unparseable lines rather than aborting
+			continue
+		}
+		if progress != nil {
+			progress(ev)
+		}
+		if ev.Error != "" {
+			return fmt.Errorf("pull error: %s", ev.Error)
+		}
+	}
+	return scanner.Err()
+}
+
+// DeleteModel removes a locally installed model.
+func (c *ollamaClient) DeleteModel(ctx context.Context, name string) error {
+	body, _ := json.Marshal(map[string]string{"name": name})
+	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, c.GetBaseURL()+"/api/delete", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return fmt.Errorf("delete model: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<15))
+		return fmt.Errorf("ollama returned %s: %s", resp.Status, strings.TrimSpace(string(respBody)))
+	}
+	return nil
+}
+
+// ollamaGenerateRequest is the body of POST /api/generate.
+type ollamaGenerateRequest struct {
+	Model  string `json:"model"`
+	Prompt string `json:"prompt"`
+	System string `json:"system,omitempty"`
+	Stream bool   `json:"stream"`
+	Format string `json:"format,omitempty"` // "json" constrains output to valid JSON
+}
+
+type ollamaGenerateResponse struct {
+	Model     string `json:"model"`
+	Response  string `json:"response"`
+	Done      bool   `json:"done"`
+	DoneError string `json:"error,omitempty"`
+}
+
+func (c *ollamaClient) GenerateJSON(ctx context.Context, systemPrompt, userPrompt string) (string, error) {
+	body, err := json.Marshal(ollamaGenerateRequest{
+		Model:  c.GetActiveModel(),
+		System: systemPrompt,
+		Prompt: userPrompt,
+		Stream: false,
+		Format: "json",
+	})
+	if err != nil {
+		return "", fmt.Errorf("marshal ollama request: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.GetBaseURL()+"/api/generate", bytes.NewReader(body))
+	if err != nil {
+		return "", err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("call ollama at %s: %w (is ollama running?)", c.GetBaseURL(), err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("ollama returned %s: %s", resp.Status, strings.TrimSpace(string(respBody)))
+	}
+
+	var out ollamaGenerateResponse
+	if err := json.Unmarshal(respBody, &out); err != nil {
+		return "", fmt.Errorf("parse ollama response: %w", err)
+	}
+	if out.DoneError != "" {
+		return "", fmt.Errorf("ollama error: %s", out.DoneError)
+	}
+	if out.Response == "" {
+		return "", fmt.Errorf("ollama returned empty response (check that model %q is pulled)", c.GetActiveModel())
+	}
+	return out.Response, nil
+}
diff --git a/services/llm_client_test.go b/services/llm_client_test.go
new file mode 100644
index 0000000..00d11f3
--- /dev/null
+++ b/services/llm_client_test.go
@@ -0,0 +1,118 @@
+package services
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+)
+
+func TestNewLLMClient_Dispatch(t *testing.T) {
+	cases := []struct {
+		name         string
+		opts         LLMClientOptions
+		wantProvider string
+		wantErr      bool
+		errSubstring string
+	}{
+		{
+			name:         "empty provider defaults to ollama",
+			opts:         LLMClientOptions{},
+			wantProvider: "ollama",
+		},
+		{
+			name:         "explicit ollama",
+			opts:         LLMClientOptions{Provider: "ollama"},
+			wantProvider: "ollama",
+		},
+		{
+			name:         "case-insensitive provider name",
+			opts:         LLMClientOptions{Provider: "Ollama"},
+			wantProvider: "ollama",
+		},
+		{
+			name:         "anthropic requires API key",
+			opts:         LLMClientOptions{Provider: "anthropic"},
+			wantErr:      true,
+			errSubstring: "ANTHROPIC_API_KEY",
+		},
+		{
+			name:         "anthropic with key succeeds",
+			opts:         LLMClientOptions{Provider: "anthropic", APIKey: "sk-test"},
+			wantProvider: "anthropic",
+		},
+		{
+			name:         "unsupported provider errors",
+			opts:         LLMClientOptions{Provider: "openai"},
+			wantErr:      true,
+			errSubstring: "unsupported",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			client, err := NewLLMClient(tc.opts)
+			if tc.wantErr {
+				if err == nil {
+					t.Fatalf("want error, got nil")
+				}
+				if tc.errSubstring != "" && !strings.Contains(err.Error(), tc.errSubstring) {
+					t.Errorf("want error containing %q, got %q", tc.errSubstring, err.Error())
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if client == nil {
+				t.Fatalf("want non-nil client")
+			}
+			if client.ProviderName() != tc.wantProvider {
+				t.Errorf("ProviderName()=%q, want %q", client.ProviderName(), tc.wantProvider)
+			}
+		})
+	}
+}
+
+func TestAnthropicClient_ModelManagementNotSupported(t *testing.T) {
+	c := newAnthropicClient("", "", "sk-test")
+	ctx := context.Background()
+	if err := c.PullModel(ctx, "anything", nil); !errors.Is(err, ErrModelManagementNotSupported) {
+		t.Errorf("PullModel: want ErrModelManagementNotSupported, got %v", err)
+	}
+	if err := c.DeleteModel(ctx, "anything"); !errors.Is(err, ErrModelManagementNotSupported) {
+		t.Errorf("DeleteModel: want ErrModelManagementNotSupported, got %v", err)
+	}
+}
+
+func TestAnthropicClient_SetGetters(t *testing.T) {
+	c := newAnthropicClient("", "", "sk-test")
+	if c.GetBaseURL() != "https://api.anthropic.com" {
+		t.Errorf("default base URL mismatch: %q", c.GetBaseURL())
+	}
+	if c.GetActiveModel() != "claude-haiku-4-5" {
+		t.Errorf("default model mismatch: %q", c.GetActiveModel())
+	}
+	c.SetActiveModel("claude-sonnet-4-6")
+	if c.GetActiveModel() != "claude-sonnet-4-6" {
+		t.Errorf("SetActiveModel did not stick: %q", c.GetActiveModel())
+	}
+	c.SetBaseURL("https://example.com/")
+	if c.GetBaseURL() != "https://example.com" {
+		t.Errorf("SetBaseURL should trim trailing slash, got %q", c.GetBaseURL())
+	}
+}
+
+func TestStripJSONFences(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{`{"a":1}`, `{"a":1}`},
+		{"```json\n{\"a\":1}\n```", `{"a":1}`},
+		{"```\n{\"a\":1}\n```", `{"a":1}`},
+		{" ```json\n{\"nested\":{\"b\":2}}\n``` ", `{"nested":{"b":2}}`},
+		{"not fenced at all", "not fenced at all"},
+	}
+	for _, tc := range cases {
+		if got := stripJSONFences(tc.in); got != tc.want {
+			t.Errorf("stripJSONFences(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
diff --git a/services/log_buffer.go b/services/log_buffer.go
new file mode 100644
index 0000000..0fa3563
--- /dev/null
+++ b/services/log_buffer.go
@@ -0,0 +1,116 @@
+package services
+
+import (
+	"context"
+	"sync"
+	"time"
+)
+
+// Maximum entries per delivery and total across all deliveries.
+const (
+	logBufferMaxPerDelivery = 100
+	logBufferMaxDeliveries  = 50
+)
+
+// LogEntry is a single captured log line for operator diagnostics.
+type LogEntry struct {
+	Time    time.Time      `json:"time"`
+	Level   string         `json:"level"`
+	Message string         `json:"message"`
+	Fields  map[string]any `json:"fields,omitempty"`
+}
+
+// DeliveryLogBuffer stores recent log entries keyed by delivery ID for the operator UI.
+type DeliveryLogBuffer struct {
+	mu      sync.Mutex
+	entries map[string][]LogEntry
+	order   []string // insertion order for eviction
+}
+
+// NewDeliveryLogBuffer creates an empty delivery log buffer.
+func NewDeliveryLogBuffer() *DeliveryLogBuffer {
+	return &DeliveryLogBuffer{
+		entries: make(map[string][]LogEntry),
+		order:   make([]string, 0, logBufferMaxDeliveries),
+	}
+}
+
+// Append adds a log entry for a delivery ID.
+func (b *DeliveryLogBuffer) Append(deliveryID string, entry LogEntry) {
+	if b == nil || deliveryID == "" {
+		return
+	}
+	if entry.Time.IsZero() {
+		entry.Time = time.Now().UTC()
+	}
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	logs, exists := b.entries[deliveryID]
+	if !exists {
+		b.order = append(b.order, deliveryID)
+		// Evict oldest delivery if over limit
+		if len(b.order) > logBufferMaxDeliveries {
+			evict := b.order[0]
+			b.order = b.order[1:]
+			delete(b.entries, evict)
+		}
+	}
+	logs = append(logs, entry)
+	if len(logs) > logBufferMaxPerDelivery {
+		logs = logs[len(logs)-logBufferMaxPerDelivery:]
+	}
+	b.entries[deliveryID] = logs
+}
+
+// Get returns log entries for a delivery ID (nil if not found).
+func (b *DeliveryLogBuffer) Get(deliveryID string) []LogEntry {
+	if b == nil {
+		return nil
+	}
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	logs, ok := b.entries[deliveryID]
+	if !ok {
+		return nil
+	}
+	// Return a copy so callers can't mutate the buffer's backing slice.
+	out := make([]LogEntry, len(logs))
+	copy(out, logs)
+	return out
+}
+
+// context key for log buffer
+type logBufferCtxKey struct{}
+
+// ContextWithLogBuffer returns a context that carries a delivery ID for log capture.
+func ContextWithLogBuffer(ctx context.Context, deliveryID string, buf *DeliveryLogBuffer) context.Context {
+	return context.WithValue(ctx, logBufferCtxKey{}, &logBufferCtxVal{deliveryID: deliveryID, buf: buf})
+}
+
+type logBufferCtxVal struct {
+	deliveryID string
+	buf        *DeliveryLogBuffer
+}
+
+// logBufferFromCtx extracts the log buffer from context (nil if not set).
+func logBufferFromCtx(ctx context.Context) *logBufferCtxVal {
+	if ctx == nil {
+		return nil
+	}
+	val, _ := ctx.Value(logBufferCtxKey{}).(*logBufferCtxVal)
+	return val
+}
+
+// appendToCtxBuffer appends a log entry to the context's delivery log buffer if present.
+func appendToCtxBuffer(ctx context.Context, level, message string, fields map[string]any) {
+	val := logBufferFromCtx(ctx)
+	if val == nil || val.buf == nil {
+		return
+	}
+	val.buf.Append(val.deliveryID, LogEntry{
+		Time:    time.Now().UTC(),
+		Level:   level,
+		Message: message,
+		Fields:  fields,
+	})
+}
diff --git a/services/logger.go b/services/logger.go
index 0efa0fe..98ea6bc 100644
--- a/services/logger.go
+++ b/services/logger.go
@@ -208,12 +208,14 @@ func LogCritical(message string, args ...any) {
 func LogInfoCtx(ctx context.Context, message string, fields map[string]interface{}) {
 	slog.InfoContext(ctx, message, mapToAttrs(fields)...)
 	logToGCP(slog.LevelInfo, message, mapToAttrs(fields)...)
+	appendToCtxBuffer(ctx, "info", message, fieldsToAny(fields))
 }
 
 // LogWarningCtx writes a warning-level log with context.
 func LogWarningCtx(ctx context.Context, message string, fields map[string]interface{}) {
 	slog.WarnContext(ctx, message, mapToAttrs(fields)...)
 	logToGCP(slog.LevelWarn, message, mapToAttrs(fields)...)
+	appendToCtxBuffer(ctx, "warn", message, fieldsToAny(fields))
 }
 
 // LogErrorCtx writes an error-level log with context and an optional error.
@@ -224,6 +226,25 @@ func LogErrorCtx(ctx context.Context, message string, err error, fields map[stri
 	}
 	slog.ErrorContext(ctx, message, attrs...)
 	logToGCP(slog.LevelError, message, attrs...)
+	f := fieldsToAny(fields)
+	if err != nil {
+		// fieldsToAny returns nil for nil input, and assigning into a nil map
+		// panics — allocate before writing the error key so a plain
+		// LogErrorCtx(ctx, msg, err, nil) call is safe.
+		if f == nil {
+			f = make(map[string]any, 1)
+		}
+		f["error"] = err.Error()
+	}
+	appendToCtxBuffer(ctx, "error", message, f)
+}
+
+// fieldsToAny shallow-copies a fields map so callers' maps are never mutated.
+// Returns nil (not an empty map) for nil input.
+func fieldsToAny(fields map[string]interface{}) map[string]any {
+	if fields == nil {
+		return nil
+	}
+	out := make(map[string]any, len(fields))
+	for k, v := range fields {
+		out[k] = v
+	}
+	return out
 }
 
 // LogWebhookOperation logs webhook-related operations.
diff --git a/services/operator_auth.go b/services/operator_auth.go
new file mode 100644
index 0000000..a313e91
--- /dev/null
+++ b/services/operator_auth.go
@@ -0,0 +1,343 @@
+package services
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+)
+
+// ghAPIError is returned by the GitHub API helper calls on any non-2xx
+// response. Callers can inspect StatusCode to distinguish transient 5xx
+// failures (should not flip authorization decisions) from 4xx responses
+// (definitive "no access").
+type ghAPIError struct {
+	StatusCode int
+	Body       string
+}
+
+func (e *ghAPIError) Error() string {
+	return fmt.Sprintf("GitHub API HTTP %d: %s", e.StatusCode, e.Body)
+}
+
+func (e *ghAPIError) IsTransient() bool { return e.StatusCode >= 500 }
+
+// hashToken returns the SHA-256 hex digest of a PAT. Used as the cache key so
+// raw tokens never sit in the process heap beyond the lifetime of a single
+// request — a memory dump of the running server won't leak active tokens.
+func hashToken(t string) string {
+	sum := sha256.Sum256([]byte(t))
+	return hex.EncodeToString(sum[:])
+}
+
+// githubAPIBaseURL is the base for GitHub REST API calls. Package var (rather
+// than a const) so tests can point it at an httptest.Server. Never set from
+// user input — the SSRF surface for ghAPIGet* is unchanged.
+var githubAPIBaseURL = "https://api.github.com"
+
+// ghUsernameRe matches valid GitHub usernames: alphanumeric + hyphens,
+// cannot start or end with a hyphen, max 39 chars. Used to reject hostile
+// input before it reaches URL construction for the GitHub API. (RE2 has no
+// lookahead, so this doesn't reject consecutive hyphens — that's a GitHub
+// policy issue, not a security one; such requests simply fail downstream.)
+var ghUsernameRe = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9-]{0,37}[a-zA-Z0-9])?$`)
+
+// ghRepoNameRe matches valid GitHub repo names.
+var ghRepoNameRe = regexp.MustCompile(`^[a-zA-Z0-9_.-]{1,100}$`)
+
+// OperatorRole represents the permission level for the operator UI.
+type OperatorRole string
+
+const (
+	// RoleOperator has full access: view, replay, release.
+	RoleOperator OperatorRole = "operator"
+	// RoleWriter has read-only access: view workflows, audit, recent copies.
+	RoleWriter OperatorRole = "writer"
+	// RoleDenied means the user has no access.
+	RoleDenied OperatorRole = "denied"
+)
+
+// OperatorUser represents an authenticated operator UI user.
+type OperatorUser struct {
+	Login     string       `json:"login"`
+	AvatarURL string       `json:"avatar_url,omitempty"`
+	Role      OperatorRole `json:"role"`
+}
+
+// ghAuthCache caches GitHub PAT validation results to avoid hitting the API on every request.
+// It also caches per-repo permission lookups (one permission level per token+repo pair).
+type ghAuthCache struct { + mu sync.RWMutex + entries map[string]*ghAuthEntry + repoPerm map[string]*ghRepoPermEntry // key: token + "\x00" + repo + ttl time.Duration +} + +type ghAuthEntry struct { + user *OperatorUser + err error + expiresAt time.Time +} + +type ghRepoPermEntry struct { + permission string // "admin", "maintain", "write", "triage", "read", or "" for denied + err error + expiresAt time.Time +} + +func newGHAuthCache(ttl time.Duration) *ghAuthCache { + return &ghAuthCache{ + entries: make(map[string]*ghAuthEntry), + repoPerm: make(map[string]*ghRepoPermEntry), + ttl: ttl, + } +} + +// Cache methods take raw tokens and hash them internally, so callers never +// have to think about the token→digest boundary. Raw tokens never become +// map keys. + +func (c *ghAuthCache) get(token string) (*OperatorUser, error, bool) { + key := hashToken(token) + c.mu.RLock() + defer c.mu.RUnlock() + e, ok := c.entries[key] + if !ok || time.Now().After(e.expiresAt) { + return nil, nil, false + } + return e.user, e.err, true +} + +func (c *ghAuthCache) set(token string, user *OperatorUser, err error) { + key := hashToken(token) + c.mu.Lock() + defer c.mu.Unlock() + c.entries[key] = &ghAuthEntry{ + user: user, + err: err, + expiresAt: time.Now().Add(c.ttl), + } + // Evict expired entries periodically (simple sweep when cache grows) + if len(c.entries) > 100 { + now := time.Now() + for k, v := range c.entries { + if now.After(v.expiresAt) { + delete(c.entries, k) + } + } + } +} + +func (c *ghAuthCache) getRepoPerm(token, repo string) (string, error, bool) { + key := hashToken(token) + "\x00" + repo + c.mu.RLock() + defer c.mu.RUnlock() + e, ok := c.repoPerm[key] + if !ok || time.Now().After(e.expiresAt) { + return "", nil, false + } + return e.permission, e.err, true +} + +func (c *ghAuthCache) setRepoPerm(token, repo, permission string, err error) { + key := hashToken(token) + "\x00" + repo + c.mu.Lock() + defer c.mu.Unlock() + c.repoPerm[key] = &ghRepoPermEntry{ + 
permission: permission, + err: err, + expiresAt: time.Now().Add(c.ttl), + } + if len(c.repoPerm) > 500 { + now := time.Now() + for k, v := range c.repoPerm { + if now.After(v.expiresAt) { + delete(c.repoPerm, k) + } + } + } +} + +// CanUserReadRepo returns true if the user (identified by PAT) has at least read access to the repo. +// Uses the cache when available. Returns (hasAccess, error). +func (c *ghAuthCache) CanUserReadRepo(ctx context.Context, pat, username, repo string) (bool, error) { + if perm, err, ok := c.getRepoPerm(pat, repo); ok { + if err != nil { + return false, err + } + return permissionGrantsRead(perm), nil + } + perm, err := ghAPIGetRepoPermission(ctx, pat, repo, username) + c.setRepoPerm(pat, repo, perm, err) + if err != nil { + return false, err + } + return permissionGrantsRead(perm), nil +} + +func permissionGrantsRead(perm string) bool { + switch perm { + case "admin", "maintain", "write", "triage", "read": + return true + } + return false +} + +// validateGitHubPAT validates a GitHub PAT and returns the authenticated user with their role. +// It calls the GitHub API to get the user info, then checks their permission on the auth repo. +func validateGitHubPAT(ctx context.Context, pat string, authRepo string) (*OperatorUser, error) { + if pat == "" { + return nil, fmt.Errorf("empty token") + } + + // 1. Get the authenticated user + ghUser, err := ghAPIGetUser(ctx, pat) + if err != nil { + return nil, fmt.Errorf("validate token: %w", err) + } + + user := &OperatorUser{ + Login: ghUser.Login, + AvatarURL: ghUser.AvatarURL, + Role: RoleWriter, // default to read-only + } + + // authRepo is required in github mode (enforced at config load via + // validateOperatorAuth). This guard is defensive only. + if authRepo == "" { + return nil, fmt.Errorf("OPERATOR_AUTH_REPO is not configured") + } + + // 2. Check the user's permission on the auth repo. 
+ // + // Authorization posture: only a transient GitHub outage (5xx) lets the + // caller through with the default writer role — otherwise a GitHub + // hiccup locks out every legitimate operator. Every other failure + // (404 "not a collaborator", 401/403, network error, parse error) + // denies access. This closes the "any valid PAT gets writer" hole that + // existed when we soft-failed on all errors. + perm, err := ghAPIGetRepoPermission(ctx, pat, authRepo, ghUser.Login) + if err != nil { + var apiErr *ghAPIError + if errors.As(err, &apiErr) && apiErr.IsTransient() { + LogWarning("GitHub permission check transiently failed, keeping writer role", + "user", ghUser.Login, "repo", authRepo, "status", apiErr.StatusCode) + return user, nil + } + user.Role = RoleDenied + return user, fmt.Errorf("user %s has no access to %s: %w", ghUser.Login, authRepo, err) + } + + // admin/maintain → operator; write/triage/read → writer. "write" is + // deliberately NOT operator: most writers have write access to the + // auth repo, so mapping write → operator would give every writer the + // ability to replay and cut releases. Operator actions require an + // explicit admin or maintain grant. + switch perm { + case "admin", "maintain": + user.Role = RoleOperator + case "write", "triage", "read": + user.Role = RoleWriter + default: + user.Role = RoleDenied + return user, fmt.Errorf("user %s has no access to %s", ghUser.Login, authRepo) + } + + return user, nil +} + +// ghUserResponse is the minimal response from GET /user. 
+type ghUserResponse struct { + Login string `json:"login"` + AvatarURL string `json:"avatar_url"` +} + +func ghAPIGetUser(ctx context.Context, pat string) (*ghUserResponse, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, githubAPIBaseURL+"/user", nil) // #nosec G107 -- githubAPIBaseURL is set by the binary, not user input + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+pat) + req.Header.Set("Accept", "application/vnd.github+json") + req.Header.Set("X-GitHub-Api-Version", "2022-11-28") + + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + return nil, err + } + defer func() { _ = resp.Body.Close() }() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<16)) + + if resp.StatusCode != http.StatusOK { + return nil, &ghAPIError{StatusCode: resp.StatusCode, Body: strings.TrimSpace(string(body))} + } + + var user ghUserResponse + if err := json.Unmarshal(body, &user); err != nil { + return nil, fmt.Errorf("parse user response: %w", err) + } + if user.Login == "" { + return nil, fmt.Errorf("empty login in GitHub response") + } + return &user, nil +} + +// ghPermissionResponse is the response from GET /repos/{owner}/{repo}/collaborators/{user}/permission. +type ghPermissionResponse struct { + Permission string `json:"permission"` +} + +func ghAPIGetRepoPermission(ctx context.Context, pat string, repo string, username string) (string, error) { + parts := strings.SplitN(repo, "/", 2) + if len(parts) != 2 { + return "", fmt.Errorf("invalid repo format: %s (expected owner/repo)", repo) + } + // Validate path components against strict whitelists before URL construction. + // Host is hardcoded to api.github.com — not user-controlled. 
+ if !ghUsernameRe.MatchString(parts[0]) { + return "", fmt.Errorf("invalid owner in repo %q", repo) + } + if !ghRepoNameRe.MatchString(parts[1]) { + return "", fmt.Errorf("invalid repo name in %q", repo) + } + if !ghUsernameRe.MatchString(username) { + return "", fmt.Errorf("invalid username %q", username) + } + apiURL := fmt.Sprintf( + "%s/repos/%s/%s/collaborators/%s/permission", + githubAPIBaseURL, url.PathEscape(parts[0]), url.PathEscape(parts[1]), url.PathEscape(username), + ) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil) // #nosec G107 G704 -- host is hardcoded to api.github.com; path components validated above + if err != nil { + return "", err + } + req.Header.Set("Authorization", "Bearer "+pat) + req.Header.Set("Accept", "application/vnd.github+json") + req.Header.Set("X-GitHub-Api-Version", "2022-11-28") + + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) // #nosec G107 G704 -- host is hardcoded to api.github.com; path components validated above + if err != nil { + return "", err + } + defer func() { _ = resp.Body.Close() }() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<16)) + + if resp.StatusCode != http.StatusOK { + return "", &ghAPIError{StatusCode: resp.StatusCode, Body: strings.TrimSpace(string(body))} + } + + var perm ghPermissionResponse + if err := json.Unmarshal(body, &perm); err != nil { + return "", fmt.Errorf("parse permission response: %w", err) + } + return perm.Permission, nil +} diff --git a/services/operator_auth_test.go b/services/operator_auth_test.go new file mode 100644 index 0000000..8bac4c3 --- /dev/null +++ b/services/operator_auth_test.go @@ -0,0 +1,230 @@ +package services + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestHashToken(t *testing.T) { + a := hashToken("secret-pat-abc") + b := hashToken("secret-pat-abc") + c := hashToken("secret-pat-xyz") + if a != b { + t.Fatalf("same input must produce same digest: 
%s vs %s", a, b) + } + if a == c { + t.Fatalf("different inputs must produce different digests") + } + if strings.Contains(a, "secret") { + t.Fatalf("digest leaks plaintext: %s", a) + } + if len(a) != 64 { + t.Fatalf("expected 64-char sha256 hex digest, got %d chars", len(a)) + } +} + +func TestGHAPIError_IsTransient(t *testing.T) { + cases := []struct { + status int + transient bool + }{ + {http.StatusInternalServerError, true}, + {http.StatusBadGateway, true}, + {http.StatusServiceUnavailable, true}, + {http.StatusNotFound, false}, + {http.StatusUnauthorized, false}, + {http.StatusForbidden, false}, + {http.StatusBadRequest, false}, + } + for _, tc := range cases { + e := &ghAPIError{StatusCode: tc.status} + if got := e.IsTransient(); got != tc.transient { + t.Errorf("status %d: IsTransient()=%v, want %v", tc.status, got, tc.transient) + } + } +} + +func TestPermissionGrantsRead(t *testing.T) { + readers := []string{"admin", "maintain", "write", "triage", "read"} + nonReaders := []string{"", "none", "denied", "unknown"} + for _, p := range readers { + if !permissionGrantsRead(p) { + t.Errorf("permission %q must grant read", p) + } + } + for _, p := range nonReaders { + if permissionGrantsRead(p) { + t.Errorf("permission %q must NOT grant read", p) + } + } +} + +// stubGitHub replaces githubAPIBaseURL with an httptest.Server that returns +// the given /user and /permission responses. Returns a cleanup func. 
+type stubResponses struct { + userStatus int + userBody string + permStatus int + permBody string +} + +func stubGitHub(t *testing.T, rs stubResponses) func() { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.URL.Path == "/user": + w.WriteHeader(rs.userStatus) + _, _ = w.Write([]byte(rs.userBody)) + case strings.HasPrefix(r.URL.Path, "/repos/") && strings.HasSuffix(r.URL.Path, "/permission"): + w.WriteHeader(rs.permStatus) + _, _ = w.Write([]byte(rs.permBody)) + default: + w.WriteHeader(http.StatusNotFound) + } + })) + prev := githubAPIBaseURL + githubAPIBaseURL = srv.URL + return func() { + githubAPIBaseURL = prev + srv.Close() + } +} + +func TestValidateGitHubPAT_RoleMapping(t *testing.T) { + cases := []struct { + name string + perm string + wantRole OperatorRole + wantErr bool + }{ + {"admin maps to operator", "admin", RoleOperator, false}, + {"maintain maps to operator", "maintain", RoleOperator, false}, + {"write maps to writer (not operator)", "write", RoleWriter, false}, + {"triage maps to writer", "triage", RoleWriter, false}, + {"read maps to writer", "read", RoleWriter, false}, + {"unknown permission denies", "mystery", RoleDenied, true}, + {"empty permission denies", "", RoleDenied, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + cleanup := stubGitHub(t, stubResponses{ + userStatus: http.StatusOK, + userBody: `{"login":"alice","avatar_url":"https://example.com/a.png"}`, + permStatus: http.StatusOK, + permBody: fmt.Sprintf(`{"permission":%q}`, tc.perm), + }) + defer cleanup() + + user, err := validateGitHubPAT(context.Background(), "pat-123", "org/repo") + if tc.wantErr && err == nil { + t.Fatalf("want error for perm=%q, got none", tc.perm) + } + if !tc.wantErr && err != nil { + t.Fatalf("unexpected error for perm=%q: %v", tc.perm, err) + } + if user == nil { + t.Fatalf("want non-nil user") + } + if user.Role != tc.wantRole { + t.Errorf("perm=%q: 
role=%q, want %q", tc.perm, user.Role, tc.wantRole) + } + }) + } +} + +// Critical test for the review finding: a 404 from the permission check must +// deny access (not soft-fail to writer). This prevents "any valid PAT → writer". +func TestValidateGitHubPAT_PermissionCheck404_Denies(t *testing.T) { + cleanup := stubGitHub(t, stubResponses{ + userStatus: http.StatusOK, + userBody: `{"login":"mallory","avatar_url":""}`, + permStatus: http.StatusNotFound, + permBody: `{"message":"Not Found"}`, + }) + defer cleanup() + + user, err := validateGitHubPAT(context.Background(), "pat-xyz", "org/repo") + if err == nil { + t.Fatalf("want error when user is not a collaborator (404), got nil") + } + if user == nil || user.Role != RoleDenied { + var role OperatorRole + if user != nil { + role = user.Role + } + t.Fatalf("want RoleDenied on 404, got %q", role) + } +} + +// 5xx from GitHub is treated as transient — users keep their default writer +// role so a GitHub outage doesn't lock everyone out. The audit log captures +// the event; the cache TTL bounds exposure. +func TestValidateGitHubPAT_PermissionCheck5xx_KeepsWriter(t *testing.T) { + cleanup := stubGitHub(t, stubResponses{ + userStatus: http.StatusOK, + userBody: `{"login":"bob","avatar_url":""}`, + permStatus: http.StatusInternalServerError, + permBody: `upstream error`, + }) + defer cleanup() + + user, err := validateGitHubPAT(context.Background(), "pat-abc", "org/repo") + if err != nil { + t.Fatalf("5xx must not surface an error to the caller (soft-fail): %v", err) + } + if user == nil || user.Role != RoleWriter { + var role OperatorRole + if user != nil { + role = user.Role + } + t.Fatalf("want RoleWriter on 5xx soft-fail, got %q", role) + } +} + +// An invalid / expired PAT (401 on /user) must deny, not soft-fail. 
+func TestValidateGitHubPAT_UserLookup401_Denies(t *testing.T) {
+	cleanup := stubGitHub(t, stubResponses{
+		userStatus: http.StatusUnauthorized,
+		userBody:   `{"message":"Bad credentials"}`,
+	})
+	defer cleanup()
+
+	user, err := validateGitHubPAT(context.Background(), "expired-pat", "org/repo")
+	if err == nil {
+		t.Fatalf("want error for invalid PAT (401), got nil")
+	}
+	if user != nil {
+		t.Errorf("want nil user on failed token validation, got %+v", user)
+	}
+
+	var apiErr *ghAPIError
+	if !errors.As(err, &apiErr) {
+		t.Errorf("want wrapped ghAPIError, got %T: %v", err, err)
+	} else if apiErr.StatusCode != http.StatusUnauthorized {
+		t.Errorf("want StatusCode=401, got %d", apiErr.StatusCode)
+	}
+}
+
+func TestGHAuthCache_UsesHashedKeys(t *testing.T) {
+	c := newGHAuthCache(5 * 60 * 1_000_000_000) // 5 min in ns; bare 5*60 was a 300ns Duration ("time" is not imported here)
+	pat := "super-secret-pat-12345"
+	user := &OperatorUser{Login: "alice", Role: RoleOperator}
+	c.set(pat, user, nil)
+
+	// The raw token must not appear as a key — only its sha256 digest.
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	for k := range c.entries {
+		if strings.Contains(k, pat) {
+			t.Fatalf("cache key leaks raw PAT: %q", k)
+		}
+	}
+	if _, ok := c.entries[hashToken(pat)]; !ok {
+		t.Fatalf("expected cache entry under hashed key")
+	}
+}
diff --git a/services/operator_llm_admin.go b/services/operator_llm_admin.go
new file mode 100644
index 0000000..2582d27
--- /dev/null
+++ b/services/operator_llm_admin.go
@@ -0,0 +1,218 @@
+package services
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// handleLLMStatus returns the current LLM settings, reachability, and installed models.
+func (o *operatorUI) handleLLMStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + out := map[string]any{ + "available": o.llm != nil, + "provider": o.cfg.LLMProvider, + "base_url": "", + "active_model": "", + "reachable": false, + "models": []LLMModel{}, + // supports_model_mgmt tells the UI whether to show pull/delete sections. + // Hosted providers (anthropic) don't expose those operations. + "supports_model_mgmt": strings.ToLower(strings.TrimSpace(o.cfg.LLMProvider)) != "anthropic", + } + if o.llm == nil { + out["error"] = "LLM client not initialized" + _ = json.NewEncoder(w).Encode(out) + return + } + out["base_url"] = o.llm.GetBaseURL() + out["active_model"] = o.llm.GetActiveModel() + + ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second) + defer cancel() + + // Cache the ping outcome for 30s. For Anthropic this saves real tokens + // (every refresh of the status tab used to hit /v1/messages); for + // Ollama it saves an /api/tags round-trip. handleLLMSettings clears the + // entry when base URL / model change so operators see fresh state. + pingErr, ok := o.llmPing.get(30 * time.Second) + if !ok { + pingErr = o.llm.Ping(ctx) + o.llmPing.set(pingErr) + } + if pingErr != nil { + out["error"] = pingErr.Error() + _ = json.NewEncoder(w).Encode(out) + return + } + out["reachable"] = true + + models, err := o.llm.ListModels(ctx) + if err != nil { + out["error"] = "list models: " + err.Error() + _ = json.NewEncoder(w).Encode(out) + return + } + out["models"] = models + _ = json.NewEncoder(w).Encode(out) +} + +// handleLLMSettings updates the active model and/or base URL at runtime. +// In-memory only — reverts to env-var defaults on process restart. 
+type llmSettingsRequest struct { + ActiveModel string `json:"active_model,omitempty"` + BaseURL string `json:"base_url,omitempty"` +} + +func (o *operatorUI) handleLLMSettings(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + if o.llm == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "LLM client not initialized"}) + return + } + body, _ := io.ReadAll(io.LimitReader(r.Body, 4096)) + var req llmSettingsRequest + if err := json.Unmarshal(body, &req); err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "invalid json"}) + return + } + changed := false + if m := strings.TrimSpace(req.ActiveModel); m != "" { + o.llm.SetActiveModel(m) + changed = true + } + if u := strings.TrimSpace(req.BaseURL); u != "" { + o.llm.SetBaseURL(u) + changed = true + } + // Invalidate the ping cache on mutation so the next /llm/status call + // re-checks liveness against the new config — otherwise an operator + // flipping the URL sees a stale "connected" line for up to 30s. + if changed { + o.llmPing.invalidate() + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "active_model": o.llm.GetActiveModel(), + "base_url": o.llm.GetBaseURL(), + }) +} + +// handleLLMDeleteModel deletes a model from the LLM server. 
+func (o *operatorUI) handleLLMDeleteModel(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodDelete { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + if o.llm == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "LLM client not initialized"}) + return + } + name := strings.TrimSpace(r.URL.Query().Get("name")) + if name == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "name query param required"}) + return + } + ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second) + defer cancel() + if err := o.llm.DeleteModel(ctx, name); err != nil { + status := http.StatusBadGateway + if errors.Is(err, ErrModelManagementNotSupported) { + status = http.StatusBadRequest + } + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()}) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{"ok": true, "deleted": name}) +} + +// handleLLMPullModel streams pull progress to the client as NDJSON. +// Each line is a JSON object with {status, completed, total, error}. 
+func (o *operatorUI) handleLLMPullModel(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + if o.llm == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "LLM client not initialized"}) + return + } + body, _ := io.ReadAll(io.LimitReader(r.Body, 4096)) + var req struct { + Name string `json:"name"` + } + if err := json.Unmarshal(body, &req); err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "invalid json"}) + return + } + req.Name = strings.TrimSpace(req.Name) + if req.Name == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "name is required"}) + return + } + // Reject up-front for hosted providers so the client doesn't have to interpret + // an NDJSON error event. + if strings.ToLower(strings.TrimSpace(o.cfg.LLMProvider)) == "anthropic" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": ErrModelManagementNotSupported.Error()}) + return + } + + // Switch to NDJSON streaming + w.Header().Set("Content-Type", "application/x-ndjson") + w.Header().Set("Cache-Control", "no-cache") + w.Header().Set("X-Accel-Buffering", "no") // disable nginx buffering when behind a proxy + flusher, canFlush := w.(http.Flusher) + encoder := json.NewEncoder(w) + + // Pulls can take a long time; don't use r.Context() if the client could disconnect + // prematurely. Use a 20-minute timeout as a safety net. 
+ ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute) + defer cancel() + // Still honor client cancellation + go func() { + <-r.Context().Done() + cancel() + }() + + err := o.llm.PullModel(ctx, req.Name, func(ev LLMPullProgress) { + _ = encoder.Encode(ev) + if canFlush { + flusher.Flush() + } + }) + if err != nil { + _ = encoder.Encode(LLMPullProgress{Error: fmt.Sprintf("pull failed: %s", err.Error())}) + if canFlush { + flusher.Flush() + } + return + } + // Final event so the client knows the stream ended successfully + _ = encoder.Encode(LLMPullProgress{Status: "done"}) + if canFlush { + flusher.Flush() + } +} diff --git a/services/operator_ratelimit.go b/services/operator_ratelimit.go new file mode 100644 index 0000000..5c15267 --- /dev/null +++ b/services/operator_ratelimit.go @@ -0,0 +1,65 @@ +package services + +import ( + "sync" + "time" +) + +// tokenBucket is a trivial fixed-window rate limiter keyed by opaque string +// (typically a hashed PAT digest). Not a strict token-bucket in the +// telecom sense — a fixed-window counter is good enough to cap LLM cost +// per operator, and is cheaper to reason about than leaky-bucket math. +// +// Eviction happens opportunistically on writes once the map grows past a +// soft cap; there's no background goroutine. +type tokenBucket struct { + mu sync.Mutex + buckets map[string]*bucketState + max int + window time.Duration +} + +type bucketState struct { + remaining int + resetAt time.Time +} + +func newTokenBucket(max int, window time.Duration) *tokenBucket { + return &tokenBucket{ + buckets: make(map[string]*bucketState), + max: max, + window: window, + } +} + +// Allow decrements the caller's remaining allowance for the current window. +// Returns (allowed, resetAt). resetAt is always non-zero so callers can +// surface a Retry-After hint regardless of the allow/deny decision. 
+func (t *tokenBucket) Allow(key string) (bool, time.Time) { + t.mu.Lock() + defer t.mu.Unlock() + now := time.Now() + b, ok := t.buckets[key] + if !ok || now.After(b.resetAt) { + reset := now.Add(t.window) + t.buckets[key] = &bucketState{remaining: t.max - 1, resetAt: reset} + t.evictExpiredLocked(now) + return true, reset + } + if b.remaining <= 0 { + return false, b.resetAt + } + b.remaining-- + return true, b.resetAt +} + +func (t *tokenBucket) evictExpiredLocked(now time.Time) { + if len(t.buckets) < 256 { + return + } + for k, b := range t.buckets { + if now.After(b.resetAt) { + delete(t.buckets, k) + } + } +} diff --git a/services/operator_ratelimit_test.go b/services/operator_ratelimit_test.go new file mode 100644 index 0000000..73bc2e1 --- /dev/null +++ b/services/operator_ratelimit_test.go @@ -0,0 +1,47 @@ +package services + +import ( + "testing" + "time" +) + +func TestTokenBucket_AllowsUpToMax(t *testing.T) { + b := newTokenBucket(3, time.Hour) + for i := 0; i < 3; i++ { + if ok, _ := b.Allow("key-a"); !ok { + t.Fatalf("call %d: want allowed", i+1) + } + } + if ok, reset := b.Allow("key-a"); ok { + t.Fatalf("4th call must be denied") + } else if reset.IsZero() { + t.Errorf("denied call must return non-zero resetAt so we can populate Retry-After") + } +} + +func TestTokenBucket_SeparateKeys(t *testing.T) { + b := newTokenBucket(1, time.Hour) + if ok, _ := b.Allow("alice"); !ok { + t.Fatalf("alice first call must be allowed") + } + if ok, _ := b.Allow("bob"); !ok { + t.Fatalf("bob must have his own bucket; first call must be allowed") + } + if ok, _ := b.Allow("alice"); ok { + t.Fatalf("alice second call must be denied") + } +} + +func TestTokenBucket_ResetsAfterWindow(t *testing.T) { + b := newTokenBucket(1, 10*time.Millisecond) + if ok, _ := b.Allow("k"); !ok { + t.Fatalf("first call must be allowed") + } + if ok, _ := b.Allow("k"); ok { + t.Fatalf("second call within window must be denied") + } + time.Sleep(15 * time.Millisecond) + if ok, _ := 
b.Allow("k"); !ok { + t.Fatalf("after window elapses, a new allowance must start") + } +} diff --git a/services/operator_suggest_rule.go b/services/operator_suggest_rule.go new file mode 100644 index 0000000..b1397f1 --- /dev/null +++ b/services/operator_suggest_rule.go @@ -0,0 +1,369 @@ +package services + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "regexp" + "strings" + "time" + + "github.com/grove-platform/github-copier/types" +) + +// operatorSuggestRuleRequest is what the operator UI sends when asking the LLM +// to generate a copier rule from a source→target example. +type operatorSuggestRuleRequest struct { + SourcePath string `json:"source_path"` + TargetPath string `json:"target_path"` + TargetRepo string `json:"target_repo,omitempty"` // optional + SourceRepo string `json:"source_repo,omitempty"` // optional, for context +} + +// operatorSuggestRuleResponse is what the handler returns: the generated rule, +// an explanation, and a verification check against the user's example. +type operatorSuggestRuleResponse struct { + RuleYAML string `json:"rule_yaml"` + Explanation string `json:"explanation,omitempty"` + Verified bool `json:"verified"` // true if the rule produces target_path from source_path + ComputedPath string `json:"computed_path,omitempty"` // actual target path the rule would produce + VerifyError string `json:"verify_error,omitempty"` // reason verification failed (if any) + Warning string `json:"warning,omitempty"` // any non-fatal concern + Error string `json:"error,omitempty"` +} + +// llmSuggestedRule is the structured JSON we ask the LLM to return. 
+type llmSuggestedRule struct { + Name string `json:"name"` + DestRepo string `json:"destination_repo"` + DestBranch string `json:"destination_branch,omitempty"` + TransformType string `json:"transform_type"` // "move" | "copy" | "glob" | "regex" + TransformFrom string `json:"transform_from,omitempty"` + TransformTo string `json:"transform_to,omitempty"` + Pattern string `json:"pattern,omitempty"` + TransformTempl string `json:"transform_template,omitempty"` + CommitStrategy string `json:"commit_strategy,omitempty"` // "direct" or "pull_request" + Explanation string `json:"explanation,omitempty"` + Extra map[string]string `json:"-"` +} + +// handleSuggestRule accepts a source/target pair and asks the configured LLM to +// generate a copier workflow rule that would produce that transformation. +// The generated rule is self-verified against the example before returning. +func (o *operatorUI) handleSuggestRule(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{Error: "method not allowed"}) + return + } + if o.llm == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{ + Error: "LLM client not initialized on server (check startup logs)", + }) + return + } + + // Per-PAT rate limit caps Anthropic token spend per operator. Keyed by + // hashed PAT so the bucket survives across cache evictions of the full + // user record and can't be leaked by a memory dump. 
+ if pat := bearerToken(r); pat != "" && o.suggestLimiter != nil { + allowed, resetAt := o.suggestLimiter.Allow(hashToken(pat)) + if !allowed { + retry := time.Until(resetAt).Round(time.Second) + w.Header().Set("Retry-After", fmt.Sprintf("%d", int(retry.Seconds()))) + w.WriteHeader(http.StatusTooManyRequests) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{ + Error: fmt.Sprintf("rate limit exceeded — try again in %s", retry), + }) + return + } + } + + body, err := io.ReadAll(io.LimitReader(r.Body, 4096)) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{Error: "read body"}) + return + } + var req operatorSuggestRuleRequest + if err := json.Unmarshal(body, &req); err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{Error: "invalid json"}) + return + } + req.SourcePath = strings.TrimSpace(req.SourcePath) + req.TargetPath = strings.TrimSpace(req.TargetPath) + req.TargetRepo = strings.TrimSpace(req.TargetRepo) + req.SourceRepo = strings.TrimSpace(req.SourceRepo) + if req.SourcePath == "" || req.TargetPath == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{Error: "source_path and target_path are required"}) + return + } + + ctx, cancel := context.WithTimeout(r.Context(), 90*time.Second) + defer cancel() + + suggestion, err := o.askLLMForRule(ctx, req) + if err != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(operatorSuggestRuleResponse{Error: err.Error()}) + return + } + + ruleYAML := renderRuleYAML(suggestion, req) + verified, computed, vErr := verifySuggestedRule(suggestion, req.SourcePath, req.TargetPath) + + resp := operatorSuggestRuleResponse{ + RuleYAML: ruleYAML, + Explanation: suggestion.Explanation, + Verified: verified, + ComputedPath: computed, + } + if vErr != nil { + resp.VerifyError = vErr.Error() + } + if !verified { + resp.Warning = 
"Generated rule did not produce the expected target path from your example. Review and adjust before saving."
+	}
+	_ = json.NewEncoder(w).Encode(resp)
+}
+
+// SuggestRuleSystemPrompt is the system prompt used by the AI rule suggester.
+// Exported so cmd/test-llm can exercise the real prompt end-to-end against
+// the configured provider (same prompt writers will hit via the UI).
+const SuggestRuleSystemPrompt = `You are a configuration generator for GitHub Copier workflows.
+
+Given a single source→target file transformation example, output ONLY a valid JSON object — no markdown, no prose outside the JSON. Generate ONE rule describing ONE transformation.
+
+Transform types (prefer the simplest that works — move > copy > glob > regex):
+- "move" — rename a directory prefix. Matches any file under transform_from; replaces the prefix with transform_to. Use when the source and target share the subpath below the renamed prefix.
+- "copy" — rename ONE exact file. Use when the example is a specific file pair, not a pattern.
+- "glob" — wildcards in pattern (e.g. "dir/**/*.ext"). Use "${relative_path}" in transform_template to preserve subdir structure after the matched prefix.
+- "regex" — Go RE2 regex with named captures (e.g. "(?P<name>.+)"). Use ONLY when move/copy/glob cannot express the rename.
+
+Response shape (omit fields that don't apply to the chosen transform_type):
+{
+  "name": "kebab-case-rule-name",
+  "destination_repo": "org/dest-repo",
+  "destination_branch": "main",
+  "commit_strategy": "pull_request",
+  "transform_type": "move" | "copy" | "glob" | "regex",
+  "transform_from": "<source path or prefix>",
+  "transform_to": "<target path or prefix>",
+  "pattern": "<glob or regex pattern>",
+  "transform_template": "<target path template>",
+  "explanation": "one sentence describing what this rule does"
+}
+
+Rules:
+- destination_branch defaults to "main"; commit_strategy defaults to "pull_request" (use "direct" only if the user's intent is clearly a direct commit).
+- If the user did not provide a target repo, use a placeholder like "org/target-repo" so the writer can fill it in. +- name should be short and kebab-case, derived from the source directory or file. +- The rule MUST produce the user's target path when applied to their source path. Verify the logic before responding. + +Examples + +Input: source=mflix/server/java-spring/App.java target=server/App.java repo=mongodb/sample-app-java-mflix +Output: {"name":"mflix-java-spring-server","destination_repo":"mongodb/sample-app-java-mflix","destination_branch":"main","commit_strategy":"pull_request","transform_type":"move","transform_from":"mflix/server/java-spring","transform_to":"server","explanation":"Renames the mflix/server/java-spring prefix to server when copying into the target repo."} + +Input: source=mflix/README-JAVA-SPRING.md target=README.md repo=mongodb/sample-app-java-mflix +Output: {"name":"mflix-readme","destination_repo":"mongodb/sample-app-java-mflix","destination_branch":"main","commit_strategy":"pull_request","transform_type":"copy","transform_from":"mflix/README-JAVA-SPRING.md","transform_to":"README.md","explanation":"Copies one specific README file and renames it in the destination."} + +Input: source=agg/python/models/user.py target=shared/python/models/user.py repo=org/shared-examples +Output: {"name":"agg-python","destination_repo":"org/shared-examples","destination_branch":"main","commit_strategy":"pull_request","transform_type":"glob","pattern":"agg/python/**/*.py","transform_template":"shared/python/${relative_path}","explanation":"Matches any .py file under agg/python and preserves the subdirectory structure under shared/python."} + +Input: source=tutorials/v2/getting-started.mdx target=docs/getting-started-v2.mdx repo=org/docs-site +Output: 
{"name":"tutorials-versioned","destination_repo":"org/docs-site","destination_branch":"main","commit_strategy":"pull_request","transform_type":"regex","pattern":"tutorials/v(?P<ver>[0-9]+)/(?P<slug>.+)\\.mdx","transform_template":"docs/${slug}-v${ver}.mdx","explanation":"Extracts version and slug from the source path and rebuilds the target filename with the version as a suffix."}`
+
+// askLLMForRule sends a structured prompt to the LLM and parses the JSON response.
+func (o *operatorUI) askLLMForRule(ctx context.Context, req operatorSuggestRuleRequest) (*llmSuggestedRule, error) {
+	userPrompt := fmt.Sprintf(`Generate a copier rule for this transformation:
+
+Source file: %s
+Target file: %s
+Target repo: %s
+
+Return ONLY a JSON object with the fields documented above. No prose outside the JSON.`,
+		req.SourcePath, req.TargetPath, defaultIfEmpty(req.TargetRepo, "(user did not specify — use a placeholder like \"org/target-repo\")"))
+
+	raw, err := o.llm.GenerateJSON(ctx, SuggestRuleSystemPrompt, userPrompt)
+	if err != nil {
+		return nil, fmt.Errorf("LLM error: %w", err)
+	}
+
+	var suggestion llmSuggestedRule
+	if err := json.Unmarshal([]byte(raw), &suggestion); err != nil {
+		return nil, fmt.Errorf("LLM returned invalid JSON: %w (response: %s)", err, truncate(raw, 200))
+	}
+	suggestion.TransformType = strings.ToLower(strings.TrimSpace(suggestion.TransformType))
+	if suggestion.DestRepo == "" && req.TargetRepo != "" {
+		suggestion.DestRepo = req.TargetRepo
+	}
+	if suggestion.DestBranch == "" {
+		suggestion.DestBranch = "main"
+	}
+	if suggestion.CommitStrategy == "" {
+		suggestion.CommitStrategy = "pull_request"
+	}
+	if suggestion.Name == "" {
+		suggestion.Name = "generated-rule"
+	}
+	return &suggestion, nil
+}
+
+// verifySuggestedRule tests whether the suggested rule, applied to sourcePath,
+// produces targetPath. Returns (matched, computedPath, error).
+func verifySuggestedRule(s *llmSuggestedRule, sourcePath, targetPath string) (bool, string, error) { + transformer := NewPathTransformer() + + switch s.TransformType { + case "move": + if s.TransformFrom == "" || s.TransformTo == "" { + return false, "", fmt.Errorf("move rule missing from/to") + } + from := strings.TrimSuffix(s.TransformFrom, "/") + if !strings.HasPrefix(sourcePath, from) { + return false, "", fmt.Errorf("source path does not start with %q", from) + } + rel := strings.TrimPrefix(strings.TrimPrefix(sourcePath, from), "/") + computed := strings.TrimSuffix(s.TransformTo, "/") + "/" + rel + computed = strings.TrimSuffix(computed, "/") + return computed == targetPath, computed, nil + + case "copy": + if s.TransformFrom == "" || s.TransformTo == "" { + return false, "", fmt.Errorf("copy rule missing from/to") + } + if sourcePath != s.TransformFrom { + return false, "", fmt.Errorf("source path %q does not equal copy from %q", sourcePath, s.TransformFrom) + } + return s.TransformTo == targetPath, s.TransformTo, nil + + case "glob": + if s.Pattern == "" || s.TransformTempl == "" { + return false, "", fmt.Errorf("glob rule missing pattern/transform") + } + matcher := NewPatternMatcher() + result := matcher.Match(sourcePath, types.SourcePattern{Type: types.PatternTypeGlob, Pattern: s.Pattern}) + if !result.Matched { + return false, "", fmt.Errorf("glob pattern %q does not match %q", s.Pattern, sourcePath) + } + // Add relative_path (server-side glob transform convention): strip prefix before first wildcard + vars := result.Variables + if vars == nil { + vars = make(map[string]string) + } + vars["relative_path"] = computeGlobRelativePath(sourcePath, s.Pattern) + computed, err := transformer.Transform(sourcePath, s.TransformTempl, vars) + if err != nil { + return false, "", fmt.Errorf("apply transform: %w", err) + } + return computed == targetPath, computed, nil + + case "regex": + if s.Pattern == "" || s.TransformTempl == "" { + return false, "", 
fmt.Errorf("regex rule missing pattern/transform") + } + re, err := regexp.Compile(s.Pattern) + if err != nil { + return false, "", fmt.Errorf("invalid regex: %w", err) + } + match := re.FindStringSubmatch(sourcePath) + if match == nil { + return false, "", fmt.Errorf("regex %q does not match %q", s.Pattern, sourcePath) + } + vars := map[string]string{"matched_pattern": s.Pattern} + for i, name := range re.SubexpNames() { + if i > 0 && name != "" { + vars[name] = match[i] + } + } + computed, err := transformer.Transform(sourcePath, s.TransformTempl, vars) + if err != nil { + return false, "", fmt.Errorf("apply transform: %w", err) + } + return computed == targetPath, computed, nil + + default: + return false, "", fmt.Errorf("unknown transform type: %q", s.TransformType) + } +} + +// computeGlobRelativePath mirrors the server-side convention: strip the +// longest literal prefix (before the first wildcard) from the source path. +func computeGlobRelativePath(sourcePath, pattern string) string { + // Find the first wildcard character in the pattern + idx := strings.IndexAny(pattern, "*?[") + if idx < 0 { + return "" + } + prefix := pattern[:idx] + // Trim to the last '/' before the wildcard to get a clean directory prefix + if slash := strings.LastIndex(prefix, "/"); slash >= 0 { + prefix = prefix[:slash+1] + } + return strings.TrimPrefix(sourcePath, prefix) +} + +// renderRuleYAML produces a YAML snippet for the operator UI to display. 
+func renderRuleYAML(s *llmSuggestedRule, req operatorSuggestRuleRequest) string { + var sb strings.Builder + sb.WriteString("- name: \"") + sb.WriteString(s.Name) + sb.WriteString("\"\n") + if req.SourceRepo != "" { + sb.WriteString(" source:\n") + sb.WriteString(" repo: \"") + sb.WriteString(req.SourceRepo) + sb.WriteString("\"\n") + } + sb.WriteString(" destination:\n") + sb.WriteString(" repo: \"") + sb.WriteString(s.DestRepo) + sb.WriteString("\"\n") + sb.WriteString(" branch: \"") + sb.WriteString(s.DestBranch) + sb.WriteString("\"\n") + sb.WriteString(" transformations:\n") + switch s.TransformType { + case "move": + fmt.Fprintf(&sb, " - move: { from: %q, to: %q }\n", s.TransformFrom, s.TransformTo) + case "copy": + fmt.Fprintf(&sb, " - copy: { from: %q, to: %q }\n", s.TransformFrom, s.TransformTo) + case "glob": + sb.WriteString(" - glob:\n") + fmt.Fprintf(&sb, " pattern: %q\n", s.Pattern) + fmt.Fprintf(&sb, " transform: %q\n", s.TransformTempl) + case "regex": + sb.WriteString(" - regex:\n") + fmt.Fprintf(&sb, " pattern: %q\n", s.Pattern) + fmt.Fprintf(&sb, " transform: %q\n", s.TransformTempl) + } + sb.WriteString(" commit_strategy:\n") + sb.WriteString(" type: \"") + sb.WriteString(s.CommitStrategy) + sb.WriteString("\"\n") + return sb.String() +} + +func defaultIfEmpty(s, def string) string { + if s == "" { + return def + } + return s +} + +// truncate shortens s to at most n runes, appending an ellipsis when cut. +// Rune-aware (not byte-aware) so multi-byte glyphs in LLM output aren't +// cut in half when we truncate for logging. 
+func truncate(s string, n int) string { + runes := []rune(s) + if len(runes) <= n { + return s + } + return string(runes[:n]) + "…" +} diff --git a/services/operator_suggest_rule_test.go b/services/operator_suggest_rule_test.go new file mode 100644 index 0000000..2e04b73 --- /dev/null +++ b/services/operator_suggest_rule_test.go @@ -0,0 +1,164 @@ +package services + +import "testing" + +// verifySuggestedRule is the invariant the AI suggester relies on: every +// generated rule must produce the user's target path when applied to their +// source path. If it doesn't, the UI surfaces a "not verified" warning +// instead of silently showing a broken rule. +// +// These tests assert that invariant holds for each transform type. + +func TestVerifySuggestedRule_Move(t *testing.T) { + t.Run("matching prefix rename", func(t *testing.T) { + s := &llmSuggestedRule{ + TransformType: "move", + TransformFrom: "agg/python", + TransformTo: "shared/python", + } + ok, computed, err := verifySuggestedRule(s, "agg/python/models/user.py", "shared/python/models/user.py") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ok { + t.Fatalf("expected match; computed=%q", computed) + } + if computed != "shared/python/models/user.py" { + t.Errorf("computed=%q", computed) + } + }) + t.Run("source doesn't start with from", func(t *testing.T) { + s := &llmSuggestedRule{ + TransformType: "move", + TransformFrom: "agg/python", + TransformTo: "shared/python", + } + ok, _, err := verifySuggestedRule(s, "other/path.py", "shared/python/other/path.py") + if err == nil || ok { + t.Fatalf("want error and no match; got ok=%v err=%v", ok, err) + } + }) + t.Run("target mismatch", func(t *testing.T) { + s := &llmSuggestedRule{ + TransformType: "move", + TransformFrom: "agg/python", + TransformTo: "shared/python", + } + ok, _, err := verifySuggestedRule(s, "agg/python/x.py", "different/target.py") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ok { + t.Fatalf("want 
verification failure for target mismatch")
+		}
+	})
+}
+
+func TestVerifySuggestedRule_Copy(t *testing.T) {
+	t.Run("exact file rename", func(t *testing.T) {
+		s := &llmSuggestedRule{
+			TransformType: "copy",
+			TransformFrom: "mflix/README-JAVA-SPRING.md",
+			TransformTo: "README.md",
+		}
+		ok, _, err := verifySuggestedRule(s, "mflix/README-JAVA-SPRING.md", "README.md")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if !ok {
+			t.Fatalf("want match")
+		}
+	})
+	t.Run("source doesn't equal from", func(t *testing.T) {
+		s := &llmSuggestedRule{
+			TransformType: "copy",
+			TransformFrom: "mflix/README.md",
+			TransformTo: "README.md",
+		}
+		ok, _, err := verifySuggestedRule(s, "different/README.md", "README.md")
+		if err == nil || ok {
+			t.Fatalf("want error and no match; got ok=%v err=%v", ok, err)
+		}
+	})
+}
+
+func TestVerifySuggestedRule_Glob(t *testing.T) {
+	t.Run("wildcard with relative_path", func(t *testing.T) {
+		s := &llmSuggestedRule{
+			TransformType: "glob",
+			Pattern: "agg/python/**/*.py",
+			TransformTempl: "shared/python/${relative_path}",
+		}
+		ok, computed, err := verifySuggestedRule(s, "agg/python/models/user.py", "shared/python/models/user.py")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if !ok {
+			t.Fatalf("expected match; computed=%q", computed)
+		}
+	})
+	t.Run("pattern doesn't match source", func(t *testing.T) {
+		s := &llmSuggestedRule{
+			TransformType: "glob",
+			Pattern: "agg/python/**/*.py",
+			TransformTempl: "shared/python/${relative_path}",
+		}
+		ok, _, err := verifySuggestedRule(s, "not-matching/file.txt", "shared/python/file.txt")
+		if err == nil || ok {
+			t.Fatalf("want error; got ok=%v err=%v", ok, err)
+		}
+	})
+}
+
+func TestVerifySuggestedRule_Regex(t *testing.T) {
+	t.Run("named captures in template", func(t *testing.T) {
+		s := &llmSuggestedRule{
+			TransformType: "regex",
+			Pattern: `tutorials/v(?P<ver>[0-9]+)/(?P<slug>.+)\.mdx`,
+			TransformTempl: "docs/${slug}-v${ver}.mdx",
+		}
+		ok, computed, err := 
verifySuggestedRule(s, "tutorials/v2/getting-started.mdx", "docs/getting-started-v2.mdx") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !ok { + t.Fatalf("expected match; computed=%q", computed) + } + }) + t.Run("invalid regex", func(t *testing.T) { + s := &llmSuggestedRule{ + TransformType: "regex", + Pattern: `[unclosed`, + TransformTempl: "anything", + } + ok, _, err := verifySuggestedRule(s, "src", "dst") + if err == nil || ok { + t.Fatalf("want error for invalid regex; got ok=%v err=%v", ok, err) + } + }) +} + +func TestVerifySuggestedRule_UnknownType(t *testing.T) { + s := &llmSuggestedRule{TransformType: "symlink"} + ok, _, err := verifySuggestedRule(s, "a", "b") + if err == nil || ok { + t.Fatalf("want error for unknown type; got ok=%v err=%v", ok, err) + } +} + +func TestTruncate_RuneSafe(t *testing.T) { + // ASCII path still works + if got := truncate("hello world", 5); got != "hello…" { + t.Errorf("ascii: got %q", got) + } + // Multi-byte runes must not be cut mid-byte + s := "日本語テスト" // 6 runes, 18 bytes + got := truncate(s, 3) + if got != "日本語…" { + t.Errorf("multibyte: got %q (len=%d)", got, len(got)) + } + // Short input returned unchanged, no ellipsis + if got := truncate("hi", 5); got != "hi" { + t.Errorf("short input changed: %q", got) + } +} diff --git a/services/operator_ui.go b/services/operator_ui.go new file mode 100644 index 0000000..6300e56 --- /dev/null +++ b/services/operator_ui.go @@ -0,0 +1,973 @@ +package services + +import ( + "bytes" + "context" + "crypto/rand" + _ "embed" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + "github.com/grove-platform/github-copier/configs" +) + +//go:embed web/operator/index.html +var operatorIndexHTML []byte + +var operatorVersionTagRe = regexp.MustCompile(`^v[0-9]+\.[0-9]+\.[0-9]+$`) + +// RegisterOperatorRoutes mounts the operator HTML UI and JSON APIs under /operator/. 
+// Call only when cfg.OperatorUIEnabled is true. Works with any HTTP origin (local
+// dev, Cloud Run, Kubernetes, etc.). Every secured API requires an Authorization:
+// Bearer <PAT> header. The user's permission on cfg.OperatorAuthRepo
+// determines their role (operator or writer).
+func RegisterOperatorRoutes(mux *http.ServeMux, cfg *configs.Config, container *ServiceContainer, version string) {
+	o := &operatorUI{
+		cfg: cfg,
+		container: container,
+		version: version,
+		ghCache: newGHAuthCache(5 * time.Minute),
+		// 30 suggestions/hour/PAT caps Anthropic spend per operator. Normal
+		// usage is well under this; a misbehaving client can't rack up a bill.
+		suggestLimiter: newTokenBucket(30, time.Hour),
+	}
+	// Always create the LLM client; availability is checked dynamically via Ping.
+	// Operators can change the active model and base URL from the UI without restart.
+	if client, err := NewLLMClient(LLMClientOptions{
+		Provider: cfg.LLMProvider,
+		BaseURL: cfg.LLMBaseURL,
+		Model: cfg.LLMModel,
+		APIKey: cfg.AnthropicAPIKey,
+	}); err != nil {
+		LogWarning("LLM client init failed", "error", err.Error())
+	} else {
+		o.llm = client
+		LogInfo("LLM rule suggester ready", "provider", client.ProviderName(), "base_url", cfg.LLMBaseURL, "model", cfg.LLMModel, "note", "availability checked at request time")
+	}
+	// Register specific paths before the /operator/ subtree so /operator/api/* is not handled by serveIndex.
+ mux.HandleFunc("/operator/api/status", o.handleOperatorStatus) + mux.HandleFunc("/operator/api/audit/events", o.wrapAPI(o.handleAuditEvents)) + mux.HandleFunc("/operator/api/audit/overview", o.wrapAPI(o.handleAuditOverview)) + mux.HandleFunc("/operator/api/observability/deliveries", o.wrapAPI(o.handleObservabilityDeliveries)) + mux.HandleFunc("/operator/api/observability/webhook-traces", o.wrapAPI(o.handleObservabilityWebhookTraces)) + mux.HandleFunc("/operator/api/deployment", o.wrapAPI(o.handleDeployment)) + mux.HandleFunc("/operator/api/release", o.wrapOperatorOnly(o.handleRelease)) + mux.HandleFunc("/operator/api/replay", o.wrapOperatorOnly(o.handleReplay)) + mux.HandleFunc("/operator/api/workflows", o.wrapAPI(o.handleWorkflows)) + mux.HandleFunc("/operator/api/logs", o.wrapAPI(o.handleDeliveryLogs)) + mux.HandleFunc("/operator/api/me", o.wrapAPI(o.handleMe)) + mux.HandleFunc("/operator/api/repo-permission", o.wrapAPI(o.handleRepoPermission)) + mux.HandleFunc("/operator/api/suggest-rule", o.wrapAPI(o.handleSuggestRule)) + mux.HandleFunc("/operator/api/llm/status", o.wrapAPI(o.handleLLMStatus)) + mux.HandleFunc("/operator/api/llm/settings", o.wrapOperatorOnly(o.handleLLMSettings)) + mux.HandleFunc("/operator/api/llm/model", o.wrapOperatorOnly(o.handleLLMDeleteModel)) + mux.HandleFunc("/operator/api/llm/pull", o.wrapOperatorOnly(o.handleLLMPullModel)) + mux.HandleFunc("/operator/", o.serveIndex) + mux.HandleFunc("/operator", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, "/operator/", http.StatusFound) + }) + LogInfo("Operator UI: /operator/ with GitHub PAT authentication", "auth_repo", cfg.OperatorAuthRepo) +} + +type operatorUI struct { + cfg *configs.Config + container *ServiceContainer + version string + replayInFlight sync.Map // key: "owner/repo#pr" → prevents concurrent replays + ghCache *ghAuthCache // GitHub PAT validation + per-repo permission cache + llm LLMClient // optional: enabled when cfg.LLMEnabled is true + suggestLimiter 
*tokenBucket // per-PAT rate limit for /api/suggest-rule (LLM cost cap) + llmPing llmPingCache // cached Ping() result so /llm/status doesn't burn tokens on every refresh +} + +// llmPingCache memoises the most recent LLMClient.Ping() outcome. Status-tab +// refreshes don't need fresh liveness data more than once every 30s, and +// each uncached ping costs one input + one output Anthropic token. +type llmPingCache struct { + mu sync.RWMutex + err error + checkedAt time.Time +} + +func (p *llmPingCache) get(ttl time.Duration) (err error, ok bool) { + p.mu.RLock() + defer p.mu.RUnlock() + if p.checkedAt.IsZero() || time.Since(p.checkedAt) > ttl { + return nil, false + } + return p.err, true +} + +func (p *llmPingCache) set(err error) { + p.mu.Lock() + defer p.mu.Unlock() + p.err = err + p.checkedAt = time.Now() +} + +// invalidate forces the next get() to miss, so operators who change the +// base URL or active model see fresh liveness state on the next refresh. +func (p *llmPingCache) invalidate() { + p.mu.Lock() + defer p.mu.Unlock() + p.err = nil + p.checkedAt = time.Time{} +} + +// operatorUserCtxKey is the context key for the authenticated operator user. +type operatorUserCtxKey struct{} + +// operatorUserFromCtx returns the authenticated user from the request context (nil if not set). +func operatorUserFromCtx(r *http.Request) *OperatorUser { + u, _ := r.Context().Value(operatorUserCtxKey{}).(*OperatorUser) + return u +} + +// wrapAPI validates the incoming request's GitHub PAT and attaches the user to the context. 
+func (o *operatorUI) wrapAPI(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + token := bearerToken(r) + if token == "" { + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "provide a GitHub Personal Access Token as Bearer token"}) + return + } + user, err := o.authenticateGitHub(r.Context(), token) + if err != nil { + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()}) + return + } + ctx := context.WithValue(r.Context(), operatorUserCtxKey{}, user) + next(w, r.WithContext(ctx)) + } +} + +// wrapOperatorOnly wraps a handler that requires the "operator" role (replay, release). +func (o *operatorUI) wrapOperatorOnly(next http.HandlerFunc) http.HandlerFunc { + return o.wrapAPI(func(w http.ResponseWriter, r *http.Request) { + user := operatorUserFromCtx(r) + if user == nil || user.Role != RoleOperator { + role := "unknown" + if user != nil { + role = string(user.Role) + } + w.WriteHeader(http.StatusForbidden) + _ = json.NewEncoder(w).Encode(map[string]string{ + "error": fmt.Sprintf("this action requires operator access (you have %s)", role), + }) + return + } + next(w, r) + }) +} + +func (o *operatorUI) authenticateGitHub(ctx context.Context, pat string) (*OperatorUser, error) { + if o.ghCache != nil { + if user, err, ok := o.ghCache.get(pat); ok { + return user, err + } + } + authCtx, cancel := context.WithTimeout(ctx, 15*time.Second) + defer cancel() + user, err := validateGitHubPAT(authCtx, pat, o.cfg.OperatorAuthRepo) + if o.ghCache != nil { + o.ghCache.set(pat, user, err) + } + return user, err +} + +// handleOperatorStatus reports whether secured operator APIs are configured (no auth). 
+func (o *operatorUI) handleOperatorStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + w.Header().Set("Content-Type", "application/json") + out := map[string]any{ + "operator_apis_enabled": true, + "auth_repo": o.cfg.OperatorAuthRepo, + "llm_available": o.llm != nil, // client exists; reachability checked via /operator/api/llm/status + "metrics_enabled": o.cfg.MetricsEnabled, + "audit_enabled": o.cfg.AuditEnabled, + "version": o.version, + } + if o.container != nil && o.container.DeliveryTracker != nil { + out["webhook_dedupe_entries"] = o.container.DeliveryTracker.Len() + out["webhook_recent_observations"] = o.container.DeliveryTracker.HistoryLen() + } + if o.container != nil && o.container.WebhookTraces != nil { + out["webhook_trace_entries"] = o.container.WebhookTraces.Len() + } + _ = json.NewEncoder(w).Encode(out) +} + +// handleMe returns the authenticated user's GitHub login, avatar, and role. +func (o *operatorUI) handleMe(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + user := operatorUserFromCtx(r) + if user == nil { + w.WriteHeader(http.StatusInternalServerError) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "no authenticated user in context"}) + return + } + _ = json.NewEncoder(w).Encode(user) +} + +// handleRepoPermission reports whether the authenticated user has read access to a given repo. +// Used by the frontend to pre-check replay eligibility per source repo. +// Query params: repos=owner/repo1,owner/repo2 (comma-separated). 
+func (o *operatorUI) handleRepoPermission(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + reposParam := strings.TrimSpace(r.URL.Query().Get("repos")) + if reposParam == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "repos query param required"}) + return + } + repos := strings.Split(reposParam, ",") + + // Per-repo result: Allowed + optional Error. Surfacing the error lets + // the frontend distinguish "user genuinely can't read this repo" from + // "GitHub rate limited us" so disabled replay buttons can carry an + // actionable tooltip instead of an opaque gray state. + type repoPerm struct { + Allowed bool `json:"allowed"` + Error string `json:"error,omitempty"` + } + result := make(map[string]repoPerm, len(repos)) + + user := operatorUserFromCtx(r) + userPAT := bearerToken(r) + if user == nil || userPAT == "" { + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "unauthenticated"}) + return + } + + ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second) + defer cancel() + for _, repo := range repos { + repo = strings.TrimSpace(repo) + if repo == "" { + continue + } + canRead, err := o.ghCache.CanUserReadRepo(ctx, userPAT, user.Login, repo) + entry := repoPerm{Allowed: canRead} + if err != nil { + entry.Error = err.Error() + } + result[repo] = entry + } + _ = json.NewEncoder(w).Encode(map[string]any{"permissions": result}) +} + +func bearerToken(r *http.Request) string { + h := r.Header.Get("Authorization") + const p = "Bearer " + if len(h) > len(p) && strings.EqualFold(h[:len(p)], p) { + return strings.TrimSpace(h[len(p):]) + } + return "" +} + +func (o *operatorUI) serveIndex(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/operator/" { + http.NotFound(w, r) + return + 
} + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _, _ = w.Write(operatorIndexHTML) +} + +func (o *operatorUI) handleAuditEvents(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + q, err := parseAuditListQuery(r) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()}) + return + } + ctx, cancel := context.WithTimeout(r.Context(), 10*time.Second) + defer cancel() + if o.container.AuditLogger == nil { + _ = json.NewEncoder(w).Encode(map[string]any{"events": []any{}}) + return + } + events, err := o.container.AuditLogger.QueryAuditEvents(ctx, q) + if err != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()}) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{"events": events}) +} + +func parseAuditListQuery(r *http.Request) (AuditListQuery, error) { + q := r.URL.Query() + lim := 50 + if v := q.Get("limit"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + lim = n + } + } + if lim > 200 { + lim = 200 + } + aq := AuditListQuery{Limit: lim} + if et := strings.TrimSpace(q.Get("event_type")); et != "" { + switch AuditEventType(et) { + case AuditEventCopy, AuditEventDeprecation, AuditEventError: + aq.EventType = et + default: + return AuditListQuery{}, fmt.Errorf("invalid event_type (use copy, deprecation, or error)") + } + } + switch strings.TrimSpace(strings.ToLower(q.Get("success"))) { + case "true": + t := true + aq.Success = &t + case "false": + f := false + aq.Success = &f + case "": + default: + return AuditListQuery{}, fmt.Errorf("invalid success (use true or false)") + } + if rn := strings.TrimSpace(q.Get("rule_name")); rn != "" { + 
aq.RuleName = rn + } + if prStr := strings.TrimSpace(q.Get("pr_number")); prStr != "" { + n, err := strconv.Atoi(prStr) + if err != nil || n <= 0 { + return AuditListQuery{}, fmt.Errorf("pr_number must be a positive integer") + } + aq.PRNumber = &n + } + if ps := strings.TrimSpace(q.Get("path")); ps != "" { + aq.PathSearch = ps + } + if since := strings.TrimSpace(q.Get("since")); since != "" { + t, err := time.Parse(time.RFC3339, since) + if err != nil { + return AuditListQuery{}, fmt.Errorf("since must be RFC3339: %w", err) + } + aq.Since = &t + } + return aq, nil +} + +func (o *operatorUI) handleAuditOverview(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + days := 14 + if v := r.URL.Query().Get("days"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + days = n + } + } + if days > 366 { + days = 366 + } + ctx, cancel := context.WithTimeout(r.Context(), 20*time.Second) + defer cancel() + if o.container.AuditLogger == nil { + _ = json.NewEncoder(w).Encode(map[string]any{ + "days": days, + "daily_volume": []DailyStats{}, + "stats_by_rule": map[string]RuleStats{}, + "audit_disabled": true, + }) + return + } + daily, err1 := o.container.AuditLogger.GetDailyVolume(ctx, days) + if err1 != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err1.Error()}) + return + } + byRule, err2 := o.container.AuditLogger.GetStatsByRule(ctx) + if err2 != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(map[string]string{"error": err2.Error()}) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "days": days, + "daily_volume": daily, + "stats_by_rule": byRule, + }) +} + +func (o *operatorUI) handleObservabilityDeliveries(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + 
w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + max := 100 + if v := r.URL.Query().Get("limit"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + max = n + } + } + if max > deliveryHistoryMax { + max = deliveryHistoryMax + } + if o.container.DeliveryTracker == nil { + _ = json.NewEncoder(w).Encode(map[string]any{"deliveries": []DeliverySnapshot{}}) + return + } + snap := o.container.DeliveryTracker.RecentDeliveries(max) + _ = json.NewEncoder(w).Encode(map[string]any{ + "deliveries": snap, + "dedupe_entries": o.container.DeliveryTracker.Len(), + }) +} + +func (o *operatorUI) handleObservabilityWebhookTraces(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + max := 50 + if v := r.URL.Query().Get("limit"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + max = n + } + } + if max > webhookTraceMaxEntries { + max = webhookTraceMaxEntries + } + if o.container == nil || o.container.WebhookTraces == nil { + _ = json.NewEncoder(w).Encode(map[string]any{"traces": []WebhookTraceEntry{}}) + return + } + tr := o.container.WebhookTraces.Recent(max) + _ = json.NewEncoder(w).Encode(map[string]any{ + "traces": tr, + "total": o.container.WebhookTraces.Len(), + }) +} + +// OperatorDeploymentInfo is non-secret runtime and platform metadata for the operator UI. 
+type OperatorDeploymentInfo struct { + Version string `json:"version"` + UptimeSeconds int64 `json:"uptime_seconds"` + MongoHealthy *bool `json:"mongo_healthy,omitempty"` + GoogleCloudRegion string `json:"google_cloud_region,omitempty"` + CloudRunService string `json:"cloud_run_service,omitempty"` + CloudRunRevision string `json:"cloud_run_revision,omitempty"` + CloudRunConfig string `json:"cloud_run_configuration,omitempty"` + GoogleCloudProject string `json:"google_cloud_project,omitempty"` + Port string `json:"port"` + WebhookPath string `json:"webhook_path"` + DryRun bool `json:"dry_run"` + AuditEnabled bool `json:"audit_enabled"` + AuditDatabase string `json:"audit_database,omitempty"` + AuditCollection string `json:"audit_collection,omitempty"` + ConfigRepo string `json:"config_repo,omitempty"` + EffectiveConfig string `json:"effective_config_file,omitempty"` + OperatorRepoSlug string `json:"operator_repo_slug,omitempty"` + ReleaseAPIMode ReleaseAPIMode `json:"release_api_mode"` + Env map[string]string `json:"cloud_env,omitempty"` +} + +// ReleaseAPIMode describes whether the operator UI can cut a release tag. +// Typed so the set of possible values is discoverable from the type alone +// and so the frontend can switch on a known enum instead of a free string. +type ReleaseAPIMode string + +const ( + // ReleaseAPIDisabled — neither OPERATOR_RELEASE_GITHUB_TOKEN nor + // OPERATOR_REPO_SLUG is configured; the UI hides the release button. + ReleaseAPIDisabled ReleaseAPIMode = "disabled" + // ReleaseAPITagCreateEnabled — both are configured; the UI shows the + // release flow and /api/release will attempt to create a tag ref. 
+ ReleaseAPITagCreateEnabled ReleaseAPIMode = "tag_create_enabled" +) + +func (o *operatorUI) handleDeployment(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + releaseMode := ReleaseAPIDisabled + if o.cfg.OperatorReleaseGitHubToken != "" && o.cfg.OperatorRepoSlug != "" { + releaseMode = ReleaseAPITagCreateEnabled + } + info := OperatorDeploymentInfo{ + Version: o.version, + UptimeSeconds: int64(time.Since(o.container.StartTime).Seconds()), + CloudRunService: os.Getenv("K_SERVICE"), + CloudRunRevision: os.Getenv("K_REVISION"), + CloudRunConfig: os.Getenv("K_CONFIGURATION"), + GoogleCloudProject: o.cfg.GoogleCloudProjectId, + Port: o.cfg.Port, + WebhookPath: o.cfg.WebserverPath, + DryRun: o.cfg.DryRun, + AuditEnabled: o.cfg.AuditEnabled, + AuditDatabase: o.cfg.AuditDatabase, + AuditCollection: o.cfg.AuditCollection, + ConfigRepo: o.cfg.ConfigRepoOwner + "/" + o.cfg.ConfigRepoName, + EffectiveConfig: o.cfg.EffectiveConfigFile(), + OperatorRepoSlug: o.cfg.OperatorRepoSlug, + ReleaseAPIMode: releaseMode, + Env: map[string]string{ + "ENV": firstEnv("ENV"), + }, + } + if region := os.Getenv("GOOGLE_CLOUD_REGION"); region != "" { + info.GoogleCloudRegion = region + } + if o.cfg.AuditEnabled && o.container.AuditLogger != nil { + ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second) + defer cancel() + healthy := o.container.AuditLogger.Ping(ctx) == nil + info.MongoHealthy = &healthy + } + _ = json.NewEncoder(w).Encode(info) +} + +type operatorReleaseRequest struct { + Version string `json:"version"` +} + +type operatorReleaseResponse struct { + OK bool `json:"ok,omitempty"` + Ref string `json:"ref,omitempty"` + TagSHA string `json:"tag_sha,omitempty"` + Message string `json:"message,omitempty"` + Error string `json:"error,omitempty"` + Notice string `json:"notice,omitempty"` +} + +func (o 
*operatorUI) handleRelease(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + body, err := io.ReadAll(io.LimitReader(r.Body, 4096)) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{Error: "read body"}) + return + } + var req operatorReleaseRequest + if err := json.Unmarshal(body, &req); err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{Error: "invalid json"}) + return + } + v := strings.TrimSpace(req.Version) + if !operatorVersionTagRe.MatchString(v) { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{Error: "version must match vMAJOR.MINOR.PATCH"}) + return + } + if o.cfg.OperatorReleaseGitHubToken == "" || o.cfg.OperatorRepoSlug == "" { + w.WriteHeader(http.StatusNotImplemented) + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{ + Error: "set OPERATOR_RELEASE_GITHUB_TOKEN and OPERATOR_REPO_SLUG to enable tag creation from the UI", + Notice: "Full releases (changelog + GitHub Release) still use ./scripts/release.sh locally.", + }) + return + } + ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second) + defer cancel() + ref, sha, err := githubCreateVersionTag(ctx, o.cfg.OperatorReleaseGitHubToken, o.cfg.OperatorRepoSlug, o.cfg.OperatorReleaseTargetBranch, v) + if err != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{Error: err.Error()}) + return + } + _ = json.NewEncoder(w).Encode(operatorReleaseResponse{ + OK: true, + Ref: ref, + TagSHA: sha, + Message: "Tag pushed to GitHub; if CI is configured for tag deploys, the pipeline should start shortly.", + Notice: "This does not update CHANGELOG.md — use scripts/release.sh for a documented release.", + }) +} + +// ── 
Per-delivery log viewer ── + +func (o *operatorUI) handleDeliveryLogs(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + deliveryID := strings.TrimSpace(r.URL.Query().Get("delivery_id")) + if deliveryID == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "delivery_id is required"}) + return + } + if o.container.DeliveryLogs == nil { + _ = json.NewEncoder(w).Encode(map[string]any{"logs": []LogEntry{}, "delivery_id": deliveryID}) + return + } + logs := o.container.DeliveryLogs.Get(deliveryID) + if logs == nil { + logs = []LogEntry{} + } + _ = json.NewEncoder(w).Encode(map[string]any{"logs": logs, "delivery_id": deliveryID}) +} + +// ── Workflow config browser ── + +func (o *operatorUI) handleWorkflows(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + if o.container.ConfigLoader == nil { + w.WriteHeader(http.StatusServiceUnavailable) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "config loader not initialized"}) + return + } + ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second) + defer cancel() + yamlCfg, err := o.container.ConfigLoader.LoadConfig(ctx, o.cfg) + if err != nil { + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(map[string]any{ + "error": "failed to load config: " + err.Error(), + "workflows": []any{}, + }) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "workflows": yamlCfg.Workflows, + "defaults": yamlCfg.Defaults, + "config_file": o.cfg.EffectiveConfigFile(), + "config_repo": o.cfg.ConfigRepoOwner + "/" + o.cfg.ConfigRepoName, + }) +} + +// ── Webhook replay ── + +type operatorReplayRequest struct { + Repo 
string `json:"repo"` // "owner/repo" + PRNumber int `json:"pr_number"` + Branch string `json:"branch"` // base branch + CommitSHA string `json:"commit_sha"` // optional — fetched from GitHub if empty +} + +type operatorReplayResponse struct { + OK bool `json:"ok,omitempty"` + Message string `json:"message,omitempty"` + Error string `json:"error,omitempty"` +} + +func (o *operatorUI) handleReplay(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "method not allowed"}) + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, 4096)) + if err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "read body"}) + return + } + var req operatorReplayRequest + if err := json.Unmarshal(body, &req); err != nil { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "invalid json"}) + return + } + + // Validate inputs + parts := strings.SplitN(strings.TrimSpace(req.Repo), "/", 2) + if len(parts) != 2 || parts[0] == "" || parts[1] == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "repo must be owner/repo"}) + return + } + owner, repoName := parts[0], parts[1] + + if req.PRNumber <= 0 { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "pr_number must be > 0"}) + return + } + if strings.TrimSpace(req.Branch) == "" { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "branch is required"}) + return + } + + // Source-repo permission check: the user's PAT must have at least read + // access to the source repo being replayed. 
+ { + user := operatorUserFromCtx(r) + userPAT := bearerToken(r) + if user == nil || userPAT == "" { + w.WriteHeader(http.StatusUnauthorized) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "unauthenticated"}) + return + } + permCtx, cancel := context.WithTimeout(r.Context(), 10*time.Second) + canRead, permErr := o.ghCache.CanUserReadRepo(permCtx, userPAT, user.Login, req.Repo) + cancel() + if !canRead { + w.WriteHeader(http.StatusForbidden) + msg := fmt.Sprintf("you do not have access to source repo %s", req.Repo) + if permErr != nil { + msg = fmt.Sprintf("%s: %s", msg, permErr.Error()) + } + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: msg}) + return + } + } + + // In-flight dedup: prevent concurrent replays for the same PR + replayKey := fmt.Sprintf("%s#%d", req.Repo, req.PRNumber) + if _, loaded := o.replayInFlight.LoadOrStore(replayKey, true); loaded { + w.WriteHeader(http.StatusConflict) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "replay already in progress for this PR"}) + return + } + + // Fetch commit SHA from GitHub if not provided + commitSHA := strings.TrimSpace(req.CommitSHA) + if commitSHA == "" { + ctx, cancel := context.WithTimeout(r.Context(), 15*time.Second) + defer cancel() + client, err := GetRestClientForOrg(ctx, o.cfg, owner) + if err != nil { + o.replayInFlight.Delete(replayKey) + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "github auth: " + err.Error()}) + return + } + pr, _, err := client.PullRequests.Get(ctx, owner, repoName, req.PRNumber) + if err != nil { + o.replayInFlight.Delete(replayKey) + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "fetch PR: " + err.Error()}) + return + } + if !pr.GetMerged() { + o.replayInFlight.Delete(replayKey) + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "PR is not merged — only merged PRs can be 
replayed"}) + return + } + commitSHA = pr.GetMergeCommitSHA() + if commitSHA == "" { + o.replayInFlight.Delete(replayKey) + w.WriteHeader(http.StatusBadGateway) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{Error: "PR has no merge commit SHA"}) + return + } + } + + // Dispatch replay in background (same path as real webhook processing) + // Millisecond timestamps alone collide when two operators replay in the + // same ms (rare but observed in tests). Append a short random suffix so + // the delivery ID is unique across concurrent replays on the same revision. + var rnd [3]byte + _, _ = rand.Read(rnd[:]) + deliveryID := fmt.Sprintf("replay-%d-%s", time.Now().UnixMilli(), hex.EncodeToString(rnd[:])) + baseBranch := strings.TrimSpace(req.Branch) + + LogInfo("operator replay requested", + "repo", req.Repo, + "pr_number", req.PRNumber, + "branch", baseBranch, + "commit_sha", commitSHA, + "delivery_id", deliveryID, + ) + + AppendWebhookTrace(o.container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: "operator_replay", + Repo: req.Repo, + BaseBranch: baseBranch, + CommitSHA: commitSHA, + PRNumber: req.PRNumber, + Outcome: "replay_started", + Detail: "initiated via operator UI", + }) + + bgCtx := context.Background() + if o.container.DeliveryLogs != nil { + bgCtx = ContextWithLogBuffer(bgCtx, deliveryID, o.container.DeliveryLogs) + } + if o.cfg.WebhookProcessingTimeoutSeconds > 0 { + var cancel context.CancelFunc + bgCtx, cancel = context.WithTimeout(bgCtx, time.Duration(o.cfg.WebhookProcessingTimeoutSeconds)*time.Second) + o.container.wg.Add(1) + go func() { + defer o.container.wg.Done() + defer cancel() + defer o.replayInFlight.Delete(replayKey) + processWebhookWithRetry(bgCtx, req.PRNumber, commitSHA, owner, repoName, baseBranch, deliveryID, o.cfg, o.container) + }() + } else { + o.container.wg.Add(1) + go func() { + defer o.container.wg.Done() + defer o.replayInFlight.Delete(replayKey) + processWebhookWithRetry(bgCtx, req.PRNumber, commitSHA, 
owner, repoName, baseBranch, deliveryID, o.cfg, o.container) + }() + } + + w.WriteHeader(http.StatusAccepted) + _ = json.NewEncoder(w).Encode(operatorReplayResponse{ + OK: true, + Message: fmt.Sprintf("Replay started for %s PR #%d (delivery %s). Check webhook traces for progress.", req.Repo, req.PRNumber, deliveryID), + }) +} + +func firstEnv(keys ...string) string { + for _, k := range keys { + if v := os.Getenv(k); v != "" { + return v + } + } + return "" +} + +// ghBranchNameRe matches branch names permitted by GitHub: no spaces, control +// chars, or the handful of reserved characters (~ ^ : ? * [ \). The regex is +// intentionally narrower than GitHub's full rules — it's a defense-in-depth +// check before we embed the value in an API path, not a validator. +var ghBranchNameRe = regexp.MustCompile(`^[A-Za-z0-9._/-]{1,120}$`) + +func githubCreateVersionTag(ctx context.Context, pat, repoSlug, baseBranch, version string) (ref string, sha string, err error) { + parts := strings.SplitN(strings.TrimSpace(repoSlug), "/", 2) + if len(parts) != 2 || parts[0] == "" || parts[1] == "" { + return "", "", fmt.Errorf("invalid OPERATOR_REPO_SLUG (want owner/repo)") + } + owner, repo := parts[0], parts[1] + // Defense-in-depth: even though these come from env vars and not user + // input, validate them against the same whitelists ghAPIGetRepoPermission + // uses before embedding in API paths. Apply url.PathEscape for the same + // reason. Keeps the gosec story consistent across the package. 
+ if !ghUsernameRe.MatchString(owner) { + return "", "", fmt.Errorf("invalid owner in OPERATOR_REPO_SLUG %q", repoSlug) + } + if !ghRepoNameRe.MatchString(repo) { + return "", "", fmt.Errorf("invalid repo name in OPERATOR_REPO_SLUG %q", repoSlug) + } + if !ghBranchNameRe.MatchString(baseBranch) { + return "", "", fmt.Errorf("invalid OPERATOR_RELEASE_TARGET_BRANCH %q", baseBranch) + } + baseURL := fmt.Sprintf( + "%s/repos/%s/%s/git/ref/heads/%s", + githubAPIBaseURL, + url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(baseBranch), + ) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL, nil) // #nosec G107 -- githubAPIBaseURL is binary-controlled; path components validated above + if err != nil { + return "", "", err + } + req.Header.Set("Authorization", "Bearer "+pat) + req.Header.Set("Accept", "application/vnd.github+json") + req.Header.Set("X-GitHub-Api-Version", "2022-11-28") + + resp, err := sharedGithubHTTPClient.Do(req) + if err != nil { + return "", "", err + } + defer func() { _ = resp.Body.Close() }() + baseBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) + if resp.StatusCode != http.StatusOK { + return "", "", fmt.Errorf("github get branch ref: %s: %s", resp.Status, strings.TrimSpace(string(baseBody))) + } + var refObj struct { + Object struct { + SHA string `json:"sha"` + } `json:"object"` + } + if err := json.Unmarshal(baseBody, &refObj); err != nil { + return "", "", fmt.Errorf("parse branch ref: %w", err) + } + headSHA := refObj.Object.SHA + if headSHA == "" { + return "", "", fmt.Errorf("empty base sha for branch %s", baseBranch) + } + + tagRef := "refs/tags/" + version + payload := map[string]string{"ref": tagRef, "sha": headSHA} + buf, err := json.Marshal(payload) + if err != nil { + return "", "", err + } + postURL := fmt.Sprintf("%s/repos/%s/%s/git/refs", githubAPIBaseURL, url.PathEscape(owner), url.PathEscape(repo)) + postReq, err := http.NewRequestWithContext(ctx, http.MethodPost, postURL, bytes.NewReader(buf)) 
// #nosec G107 -- githubAPIBaseURL is binary-controlled; path components validated above + if err != nil { + return "", "", err + } + postReq.Header.Set("Authorization", "Bearer "+pat) + postReq.Header.Set("Accept", "application/vnd.github+json") + postReq.Header.Set("Content-Type", "application/json") + postReq.Header.Set("X-GitHub-Api-Version", "2022-11-28") + + postResp, err := sharedGithubHTTPClient.Do(postReq) + if err != nil { + return "", "", err + } + defer func() { _ = postResp.Body.Close() }() + postBody, _ := io.ReadAll(io.LimitReader(postResp.Body, 1<<20)) + if postResp.StatusCode != http.StatusCreated { + return "", "", fmt.Errorf("github create tag ref: %s: %s", postResp.Status, strings.TrimSpace(string(postBody))) + } + var created struct { + Ref string `json:"ref"` + Object struct { + SHA string `json:"sha"` + } `json:"object"` + } + if err := json.Unmarshal(postBody, &created); err != nil { + return "", "", fmt.Errorf("parse create ref response: %w", err) + } + return created.Ref, created.Object.SHA, nil +} + +// sharedGithubHTTPClient is reused for all operator-originated GitHub API +// calls (release tagging, etc.). Reusing one *http.Client amortizes the +// underlying transport's connection pool. 
+var sharedGithubHTTPClient = &http.Client{Timeout: 25 * time.Second} diff --git a/services/service_container.go b/services/service_container.go index 85c1dfb..212c15c 100644 --- a/services/service_container.go +++ b/services/service_container.go @@ -28,6 +28,12 @@ type ServiceContainer struct { // Webhook deduplication DeliveryTracker *DeliveryTracker + // Recent webhook outcomes for operator troubleshooting (in-memory) + WebhookTraces *WebhookTraceBuffer + + // Per-delivery log capture for operator diagnostics (in-memory) + DeliveryLogs *DeliveryLogBuffer + // Server state StartTime time.Time @@ -101,6 +107,8 @@ func NewServiceContainer(config *configs.Config) (*ServiceContainer, error) { MetricsCollector: metricsCollector, SlackNotifier: slackNotifier, DeliveryTracker: NewDeliveryTracker(1 * time.Hour), + WebhookTraces: NewWebhookTraceBuffer(), + DeliveryLogs: NewDeliveryLogBuffer(), StartTime: time.Now(), }, nil } diff --git a/services/slack_notifier.go b/services/slack_notifier.go index 95a0bfb..b9ac25d 100644 --- a/services/slack_notifier.go +++ b/services/slack_notifier.go @@ -413,14 +413,14 @@ func (sn *DefaultSlackNotifier) sendMessageWithFallback(ctx context.Context, mes // sendPayload sends the raw JSON payload to Slack func (sn *DefaultSlackNotifier) sendPayload(ctx context.Context, payload []byte) error { - req, err := http.NewRequestWithContext(ctx, "POST", sn.webhookURL, bytes.NewBuffer(payload)) + req, err := http.NewRequestWithContext(ctx, "POST", sn.webhookURL, bytes.NewBuffer(payload)) // #nosec G107 G704 -- URL is the Slack webhook URL from trusted server config, not user input if err != nil { return fmt.Errorf("failed to create slack request: %w", err) } req.Header.Set("Content-Type", "application/json") - resp, err := sn.httpClient.Do(req) // #nosec G704 -- URL is the Slack webhook URL from trusted config + resp, err := sn.httpClient.Do(req) // #nosec G107 G704 -- URL is the Slack webhook URL from trusted server config, not user input if err 
!= nil { return fmt.Errorf("failed to send slack message: %w", err) } diff --git a/services/web/operator/index.html b/services/web/operator/index.html new file mode 100644 index 0000000..3f322ce --- /dev/null +++ b/services/web/operator/index.html @@ -0,0 +1,1962 @@ + + + + + + + GitHub Copier — Operator + + + + +
+
+ GitHub Copier + ... + + + + + +
+
+
+ + + + +
+
+ + + +
+ +
+
+ +
+
+ + +
+

Stored in sessionStorage (this tab only). Your permission on the auth repo determines your role (operator or writer).

+
+
+ + +
+ + + + + +
+ + +
+
+
+

Live metrics

+ +
+
+
+ + +
+ +
+
+
+ +
+

Deployment

+
+ + + + + +
+
+
+ + +
+
+
+

Recent webhook activity

+ +
+
+
+
+
+
+ + +
+ +
+
TimeOutcomeRepoPRBaseEventActionDeliveryDetail
+
+
+
+ +
+

Webhook deliveries (dedup)

+
+

Recent X-GitHub-Delivery IDs. In-memory; resets on restart.

+
+
+ +
+ +
SeenDelivery IDDuplicate
+
+
+
+ + +
+ + + + + + + +
+
+

Recent copies

+
+
+

Latest file copies at a glance. Click any item to see full details.

+
+
+
+ +
+
+

Audit events

+ +
+
+
+
+
+
+
+
+ + + +
+
+ Quick: + + + + + +
+ + +
+
TimeTypeStatusRuleSourceTargetSHAPRError
+
+
+
+ +
+
+

Audit aggregates

+
+
+
+
+ +
+ +

Daily copy volume

+
DateTotalOKFailed
+

By rule

+
RuleTotalOKFailedAvg msSuccess
+
+
+
+ + +
+ +
+
+

File match tester

+
+
+

Enter a source file path to see which workflow rules would match it and where the file would be copied.

+
+
+ + +
+
+
+
+ + +
+
+

AI settingsAI

+ +
+
+

Configure the LLM provider for the AI rule suggester. Settings are process-global: changes affect every operator using this revision, apply immediately, and revert to env-var defaults on restart. Coordinate with other operators before changing the active model or base URL.

+
+ +
+
+ + + +
+
+ + +
+
+

AI rule suggesterAI

+
+
+ + +
+
+ + +
+
+

Active workflows

+
+
+

Source → target mappings loaded from the copier config. Shows file patterns, commit strategy, and transformation rules.

+
+
+ +
+ +
+
+
+ + +
+
+

How it works

+
+
+
+

What is GitHub Copier?

+

GitHub Copier automatically copies files between repositories when pull requests are merged. It listens for GitHub webhook events and runs workflow rules that define which files to copy and where.

+ +

When does a copy happen?

+
    +
  1. You merge a PR in a source repository
  2. +
  3. GitHub sends a webhook to the copier service
  4. +
  5. The copier loads its config and checks which workflows match the source repo and branch
  6. +
  7. For each matching workflow, it checks which changed files match the transformation rules
  8. +
  9. Matched files are copied to the target repository (via direct commit or pull request)
  10. +
+ +

How do I check if my file is covered?

+
    +
  1. Go to the File match tester above
  2. +
  3. Enter your file path (e.g., docs/api/endpoints.md)
  4. +
  5. Click Test to see which workflows match and where the file would be copied
  6. +
+ +

My file wasn't copied — why?

+
    +
  • The PR must be merged (not just closed) to trigger a copy
  • +
  • The source repo and branch must match a workflow's source setting
  • +
  • The file path must match at least one transformation rule (move, copy, glob, or regex)
  • +
  • The file must not be in an exclude pattern
  • +
  • Check the Audit tab → PR lookup to see if the webhook was received and what happened
  • +
+ +

Keyboard shortcuts

+

Press ? anywhere to see all shortcuts, or 1–5 to switch tabs.

+
+
+
+
+ + +
+
+

Liveness, readiness & metrics

+
+

Public probe routes. No token required.

+
+
+

GET /health

+

GET /ready

+

GET /metrics

+
+ +
+
+ +
+

Release (tag → deploy)

+
+

Pushes a vMAJOR.MINOR.PATCH tag. Changelog requires scripts/release.sh.

+ +
+ +
+
+
+ +
+ + +
+ + + + + + + + + + + diff --git a/services/webhook_handler_new.go b/services/webhook_handler_new.go index 9d8f4aa..871875d 100644 --- a/services/webhook_handler_new.go +++ b/services/webhook_handler_new.go @@ -88,26 +88,41 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * if err != nil { LogWebhookOperation(ctx, "read_body", "failed to read webhook body", err) container.MetricsCollector.RecordWebhookFailed() + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: r.Header.Get("X-GitHub-Delivery"), + EventType: r.Header.Get("X-GitHub-Event"), + Outcome: "read_body_failed", + Detail: err.Error(), + }) http.Error(w, "invalid body", http.StatusBadRequest) return } eventType := r.Header.Get("X-GitHub-Event") + deliveryID := r.Header.Get("X-GitHub-Delivery") if eventType == "" { LogWebhookOperation(ctx, "missing_event", "missing X-GitHub-Event header", nil) container.MetricsCollector.RecordWebhookFailed() + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + Outcome: "missing_event_header", + }) http.Error(w, "missing event type", http.StatusBadRequest) return } // Check for duplicate delivery using X-GitHub-Delivery header - deliveryID := r.Header.Get("X-GitHub-Delivery") if deliveryID != "" && container.DeliveryTracker != nil { if !container.DeliveryTracker.TryRecord(deliveryID) { LogInfoCtx(ctx, "duplicate webhook delivery, skipping", map[string]interface{}{ "delivery_id": deliveryID, "event_type": eventType, }) + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: eventType, + Outcome: "duplicate_delivery", + }) w.WriteHeader(http.StatusOK) return } @@ -125,6 +140,11 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * if !simpleVerifySignature(sigHeader, payload, []byte(config.WebhookSecret)) { LogWebhookOperation(ctx, "signature_verification", "webhook signature verification failed", nil) 
container.MetricsCollector.RecordWebhookFailed() + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: eventType, + Outcome: "signature_failed", + }) http.Error(w, "unauthorized", http.StatusUnauthorized) return } @@ -141,6 +161,12 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * LogWebhookOperation(ctx, "parse_payload", "failed to parse webhook payload", err, map[string]interface{}{"event_type": eventType}) container.MetricsCollector.RecordWebhookFailed() + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: eventType, + Outcome: "parse_failed", + Detail: err.Error(), + }) http.Error(w, "bad webhook", http.StatusBadRequest) return } @@ -156,6 +182,11 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * "event_type": eventType, "size_bytes": len(payload), }) + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: eventType, + Outcome: "ignored_non_pull_request", + }) w.WriteHeader(http.StatusNoContent) return } @@ -173,6 +204,23 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * "action": action, "merged": merged, }) + trace := WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: eventType, + Action: action, + Outcome: "skipped_not_merged_pr", + Detail: fmt.Sprintf("merged=%v", merged), + } + if r := prEvt.GetRepo(); r != nil { + trace.Repo = r.GetFullName() + } + if pr := prEvt.GetPullRequest(); pr != nil { + trace.PRNumber = pr.GetNumber() + if b := pr.GetBase(); b != nil { + trace.BaseBranch = b.GetRef() + } + } + AppendWebhookTrace(container, trace) w.WriteHeader(http.StatusNoContent) return } @@ -185,6 +233,13 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * repo := prEvt.GetRepo() if repo == nil { LogWarningCtx(ctx, "webhook missing repository info", nil) + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: 
deliveryID, + EventType: eventType, + Action: action, + PRNumber: prNumber, + Outcome: "invalid_payload_missing_repo", + }) w.WriteHeader(http.StatusBadRequest) return } @@ -230,6 +285,10 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * // Process asynchronously in background with a new context. // Don't use the request context as it will be cancelled when the request completes. bgCtx := context.Background() + // Attach log buffer for operator diagnostics + if container.DeliveryLogs != nil { + bgCtx = ContextWithLogBuffer(bgCtx, deliveryID, container.DeliveryLogs) + } // Apply a timeout to prevent stuck API calls from running indefinitely (#9). if config.WebhookProcessingTimeoutSeconds > 0 { @@ -252,6 +311,27 @@ func HandleWebhookWithContainer(w http.ResponseWriter, r *http.Request, config * } } +// webhookResult carries completion info from a successful webhook processing run, +// surfaced in the operator UI webhook trace for at-a-glance diagnostics. +type webhookResult struct { + TargetRepos []string + FilesMatched int + FilesUploaded int + FilesFailed int +} + +func (r *webhookResult) traceDetail(attempt int) string { + if r == nil { + return fmt.Sprintf("attempt %d", attempt) + } + targets := strings.Join(r.TargetRepos, ", ") + if targets == "" { + targets = "(none)" + } + return fmt.Sprintf("attempt %d | %d matched, %d uploaded, %d failed | targets: %s", + attempt, r.FilesMatched, r.FilesUploaded, r.FilesFailed, targets) +} + // processWebhookWithRetry wraps handleMergedPRWithContainer with panic recovery // and exponential-backoff retries for transient failures (#7). 
func processWebhookWithRetry(ctx context.Context, prNumber int, sourceCommitSHA string, repoOwner string, repoName string, baseBranch string, deliveryID string, config *configs.Config, container *ServiceContainer) { @@ -261,8 +341,20 @@ func processWebhookWithRetry(ctx context.Context, prNumber int, sourceCommitSHA var lastErr error for attempt := 1; attempt <= maxAttempts; attempt++ { - lastErr = runWithRecovery(ctx, prNumber, sourceCommitSHA, repoOwner, repoName, baseBranch, config, container) + result, err := runWithRecovery(ctx, prNumber, sourceCommitSHA, repoOwner, repoName, baseBranch, config, container) + lastErr = err if lastErr == nil { + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: "pull_request", + Action: "closed", + Repo: webhookRepo, + BaseBranch: baseBranch, + CommitSHA: sourceCommitSHA, + PRNumber: prNumber, + Outcome: "processed_ok", + Detail: result.traceDetail(attempt), + }) return // success } @@ -322,6 +414,17 @@ func processWebhookWithRetry(ctx context.Context, prNumber int, sourceCommitSHA "error", lastErr, ) container.MetricsCollector.RecordWebhookFailed() + AppendWebhookTrace(container, WebhookTraceEntry{ + DeliveryID: deliveryID, + EventType: "pull_request", + Action: "closed", + Repo: webhookRepo, + BaseBranch: baseBranch, + CommitSHA: sourceCommitSHA, + PRNumber: prNumber, + Outcome: "processing_failed", + Detail: fmt.Sprintf("after %d attempt(s): %v", maxAttempts, lastErr), + }) if notifyErr := container.SlackNotifier.NotifyError(ctx, &ErrorEvent{ Operation: operation, Error: fmt.Errorf("failed after %d attempt(s): %w", maxAttempts, lastErr), @@ -336,9 +439,10 @@ func processWebhookWithRetry(ctx context.Context, prNumber int, sourceCommitSHA // runWithRecovery calls handleMergedPRWithContainer in a panic-safe wrapper, // converting panics into errors. 
-func runWithRecovery(ctx context.Context, prNumber int, sourceCommitSHA string, repoOwner string, repoName string, baseBranch string, config *configs.Config, container *ServiceContainer) (retErr error) { +func runWithRecovery(ctx context.Context, prNumber int, sourceCommitSHA string, repoOwner string, repoName string, baseBranch string, config *configs.Config, container *ServiceContainer) (retResult *webhookResult, retErr error) { defer func() { if r := recover(); r != nil { + retResult = nil retErr = fmt.Errorf("panic: %v", r) LogCritical("panic in webhook handler", "pr_number", prNumber, "repo_owner", repoOwner, "repo_name", repoName, "recovered", r) } @@ -348,8 +452,9 @@ func runWithRecovery(ctx context.Context, prNumber int, sourceCommitSHA string, // handleMergedPRWithContainer orchestrates processing of a merged PR: // auth → config → match workflows → fetch changed files → process → upload → notify. -// Returns an error if a retryable failure occurred (#6 — per-workflow error tracking). -func handleMergedPRWithContainer(ctx context.Context, prNumber int, sourceCommitSHA string, repoOwner string, repoName string, baseBranch string, config *configs.Config, container *ServiceContainer) error { +// Returns a webhookResult on success (for operator trace enrichment) and an error +// if a retryable failure occurred (#6 — per-workflow error tracking). 
+func handleMergedPRWithContainer(ctx context.Context, prNumber int, sourceCommitSHA string, repoOwner string, repoName string, baseBranch string, config *configs.Config, container *ServiceContainer) (*webhookResult, error) { startTime := time.Now() webhookRepo := fmt.Sprintf("%s/%s", repoOwner, repoName) @@ -359,20 +464,20 @@ func handleMergedPRWithContainer(ctx context.Context, prNumber int, sourceCommit LogAndReturnError(ctx, "auth", "failed to configure GitHub permissions", err) container.MetricsCollector.RecordWebhookFailed() notifySlackError(ctx, container, "auth", err, prNumber, webhookRepo) - return fmt.Errorf("auth: %w", err) + return nil, fmt.Errorf("auth: %w", err) } } // 2. Load config and find matching workflows yamlConfig, err := loadAndMatchWorkflows(ctx, config, container, webhookRepo, baseBranch, prNumber) if err != nil { - return fmt.Errorf("config: %w", err) + return nil, fmt.Errorf("config: %w", err) } // 3. Fetch changed files from the source PR changedFiles, err := fetchChangedFiles(ctx, config, container, repoOwner, repoName, prNumber, webhookRepo) if err != nil { - return fmt.Errorf("fetch_files: %w", err) + return nil, fmt.Errorf("fetch_files: %w", err) } // 4. 
Snapshot metrics before processing @@ -391,16 +496,24 @@ func handleMergedPRWithContainer(ctx context.Context, prNumber int, sourceCommit reportCompletion(ctx, container, webhookRepo, prNumber, sourceCommitSHA, startTime, filesMatchedBefore, filesUploadedBefore, filesFailedBefore, targetRepos) + // Build result for operator trace enrichment + result := &webhookResult{ + TargetRepos: targetRepos, + FilesMatched: container.MetricsCollector.GetFilesMatched() - filesMatchedBefore, + FilesUploaded: container.MetricsCollector.GetFilesUploaded() - filesUploadedBefore, + FilesFailed: container.MetricsCollector.GetFilesUploadFailed() - filesFailedBefore, + } + // Return an aggregate error if any workflows failed (enables retry for partial failures) if len(workflowErrors) > 0 { errMsgs := make([]string, 0, len(workflowErrors)) for wfName, wfErr := range workflowErrors { errMsgs = append(errMsgs, fmt.Sprintf("%s: %v", wfName, wfErr)) } - return fmt.Errorf("%d workflow(s) failed: %s", len(workflowErrors), strings.Join(errMsgs, "; ")) + return result, fmt.Errorf("%d workflow(s) failed: %s", len(workflowErrors), strings.Join(errMsgs, "; ")) } - return nil + return result, nil } // loadAndMatchWorkflows loads the YAML config and filters to workflows matching @@ -464,7 +577,7 @@ func fetchChangedFiles(ctx context.Context, config *configs.Config, container *S func uploadAndDeprecateFiles(ctx context.Context, config *configs.Config, container *ServiceContainer, sourceRepoOwner, sourceRepoName, sourceBranch string, prNumber int) { // Upload queued files filesToUpload := container.FileStateService.GetFilesToUpload() - AddFilesToTargetRepos(ctx, config, filesToUpload, container.PRTemplateFetcher, container.MetricsCollector) + AddFilesToTargetRepos(ctx, config, filesToUpload, container.PRTemplateFetcher, container.MetricsCollector, container.AuditLogger) container.FileStateService.ClearFilesToUpload() // Build deprecation map and update file in the source repo diff --git 
a/services/webhook_trace_buffer.go b/services/webhook_trace_buffer.go new file mode 100644 index 0000000..8d5b16e --- /dev/null +++ b/services/webhook_trace_buffer.go @@ -0,0 +1,97 @@ +package services + +import ( + "sync" + "time" +) + +const webhookTraceMaxEntries = 120 + +// WebhookTraceEntry is one observed webhook for operator troubleshooting (in-memory only). +type WebhookTraceEntry struct { + At time.Time `json:"at"` + DeliveryID string `json:"delivery_id,omitempty"` + EventType string `json:"event_type,omitempty"` + Action string `json:"action,omitempty"` + Repo string `json:"repo,omitempty"` + BaseBranch string `json:"base_branch,omitempty"` + CommitSHA string `json:"commit_sha,omitempty"` + PRNumber int `json:"pr_number,omitempty"` + Outcome string `json:"outcome"` + Detail string `json:"detail,omitempty"` +} + +// WebhookTraceBuffer stores the last N webhook outcomes for the operator UI. +type WebhookTraceBuffer struct { + mu sync.Mutex + buf []WebhookTraceEntry +} + +// NewWebhookTraceBuffer creates an empty trace buffer. +func NewWebhookTraceBuffer() *WebhookTraceBuffer { + return &WebhookTraceBuffer{buf: make([]WebhookTraceEntry, 0, 32)} +} + +// Append adds an entry (timestamps default to UTC now; detail is truncated). +func (b *WebhookTraceBuffer) Append(e WebhookTraceEntry) { + if b == nil { + return + } + if e.Outcome == "" { + e.Outcome = "unknown" + } + if e.At.IsZero() { + e.At = time.Now().UTC() + } + if len(e.Detail) > 500 { + e.Detail = e.Detail[:500] + "…" + } + b.mu.Lock() + defer b.mu.Unlock() + b.buf = append(b.buf, e) + if len(b.buf) > webhookTraceMaxEntries { + b.buf = b.buf[len(b.buf)-webhookTraceMaxEntries:] + } +} + +// Len returns how many trace entries are buffered. +func (b *WebhookTraceBuffer) Len() int { + if b == nil { + return 0 + } + b.mu.Lock() + defer b.mu.Unlock() + return len(b.buf) +} + +// Recent returns the last up to max entries (oldest first within the slice). 
+func (b *WebhookTraceBuffer) Recent(max int) []WebhookTraceEntry { + if b == nil { + return nil + } + b.mu.Lock() + defer b.mu.Unlock() + if len(b.buf) == 0 { + return nil + } + if max <= 0 { + max = 50 + } + if max > webhookTraceMaxEntries { + max = webhookTraceMaxEntries + } + if len(b.buf) <= max { + out := make([]WebhookTraceEntry, len(b.buf)) + copy(out, b.buf) + return out + } + return append([]WebhookTraceEntry(nil), b.buf[len(b.buf)-max:]...) +} + +// AppendWebhookTrace records one webhook row for the operator dashboard. +func AppendWebhookTrace(c *ServiceContainer, e WebhookTraceEntry) { + if c == nil || c.WebhookTraces == nil { + return + } + c.WebhookTraces.Append(e) +} diff --git a/services/workflow_processor.go b/services/workflow_processor.go index c29d1b1..8700223 100644 --- a/services/workflow_processor.go +++ b/services/workflow_processor.go @@ -150,7 +150,7 @@ func (wp *workflowProcessor) ProcessWorkflow( continue // fetch failed — already logged } mr.fileContent.Name = github.Ptr(mr.targetPath) - wp.queueUpload(ctx, mr.workflow, mr.fileContent, mr.targetPath, mr.prNumber, mr.sourceCommitSHA) + wp.queueUpload(ctx, mr.workflow, mr.fileContent, mr.targetPath, mr.prNumber, mr.sourceCommitSHA, mr.file.Path) filesMatched++ } @@ -411,6 +411,7 @@ func (wp *workflowProcessor) queueUpload( targetPath string, prNumber int, sourceCommitSHA string, + sourcePath string, ) { // Create upload key — includes CommitStrategy so that workflows with @@ -454,6 +455,13 @@ func (wp *workflowProcessor) queueUpload( // Add file to content content.Content = append(content.Content, *fileContent) + content.FileMeta = append(content.FileMeta, types.CopierFileMeta{ + RuleName: workflow.Name, + SourceRepo: workflow.Source.Repo, + SourcePath: sourcePath, + CommitSHA: sourceCommitSHA, + PRNumber: prNumber, + }) // Render templates with message context msgCtx := types.NewMessageContext() diff --git a/types/types.go b/types/types.go index 5f1a3c3..88a3d10 100644 --- 
a/types/types.go +++ b/types/types.go @@ -110,14 +110,25 @@ type UploadKey struct { } type UploadFileContent struct { - TargetBranch string `json:"target_branch"` - Content []github.RepositoryContent `json:"content"` - CommitStrategy CommitStrategy `json:"commit_strategy,omitempty"` - CommitMessage string `json:"commit_message,omitempty"` - PRTitle string `json:"pr_title,omitempty"` - PRBody string `json:"pr_body,omitempty"` - UsePRTemplate bool `json:"use_pr_template,omitempty"` // If true, fetch and merge PR template from target repo - AutoMergePR bool `json:"auto_merge_pr,omitempty"` + TargetBranch string `json:"target_branch"` + Content []github.RepositoryContent `json:"content"` + // FileMeta aligns 1:1 with Content — provenance for each file (audit, Slack, diagnostics). + FileMeta []CopierFileMeta `json:"file_meta,omitempty"` + CommitStrategy CommitStrategy `json:"commit_strategy,omitempty"` + CommitMessage string `json:"commit_message,omitempty"` + PRTitle string `json:"pr_title,omitempty"` + PRBody string `json:"pr_body,omitempty"` + UsePRTemplate bool `json:"use_pr_template,omitempty"` // If true, fetch and merge PR template from target repo + AutoMergePR bool `json:"auto_merge_pr,omitempty"` +} + +// CopierFileMeta carries per-file provenance for uploads (order matches UploadFileContent.Content). +type CopierFileMeta struct { + RuleName string `json:"rule_name,omitempty"` + SourceRepo string `json:"source_repo,omitempty"` + SourcePath string `json:"source_path,omitempty"` + CommitSHA string `json:"commit_sha,omitempty"` + PRNumber int `json:"pr_number,omitempty"` } // CommitStrategy represents the strategy for committing changes