diff --git a/src/sentry/integrations/source_code_management/repo_trees.py b/src/sentry/integrations/source_code_management/repo_trees.py index 8ff5a333bccf..fe7d08c57ddb 100644 --- a/src/sentry/integrations/source_code_management/repo_trees.py +++ b/src/sentry/integrations/source_code_management/repo_trees.py @@ -190,6 +190,8 @@ def get_cached_repo_files( repo_full_name: e.g. getsentry/sentry tree_sha: A branch or a commit sha + shifted_seconds: Staggers cache expiration times across repositories + so cache misses and API refreshes are spread out over time. only_source_code_files: Include all files or just the source code files only_use_cache: Do not hit the network but use the value from the cache if any. This is useful if the remaining API requests are low @@ -199,6 +201,7 @@ def get_cached_repo_files( use_api = not cache_hit and not only_use_cache repo_files: list[str] = cache.get(key, []) if use_api: + tree = None # Cache miss – fetch from API try: tree = self.get_client().get_tree(repo_full_name, tree_sha) @@ -209,8 +212,16 @@ def get_cached_repo_files( "Caching empty files result for repo", extra={"repo": repo_full_name}, ) - cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) - tree = None + except ApiError as error: + if _is_not_found_error(error): + # Keep visibility when transient 404s happen while still + # caching an empty result to avoid repeated API calls. + logger.warning( + "Caching empty files result for missing repo or ref", + extra={"repo": repo_full_name, "error_code": error.code}, + ) + else: + raise if tree: # Keep files; discard directories repo_files = [node["path"] for node in tree if node["type"] == "blob"] @@ -225,6 +236,8 @@ def get_cached_repo_files( # repositories is a single API network request, thus, # being acceptable to sometimes not having everything cached cache.set(key, repo_files, self.CACHE_SECONDS + shifted_seconds) + else: + cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) metrics.incr( f"{METRICS_KEY_PREFIX}.get_tree", @@ -296,3 +309,13 @@ def should_include(file_path: str) -> bool: if any(file_path.startswith(path) for path in EXCLUDED_PATHS): return False return True + + +def _is_not_found_error(error: ApiError) -> bool: + if error.code == 404: + return True + if error.code is not None: + return False + + error_message = error.json.get("message") if error.json else error.text + return error_message in ("Not Found", "Not Found.") diff --git a/tests/sentry/integrations/github/test_client.py b/tests/sentry/integrations/github/test_client.py index 41180ba78c5d..29ff879520f8 100644 --- a/tests/sentry/integrations/github/test_client.py +++ b/tests/sentry/integrations/github/test_client.py @@ -379,6 +379,76 @@ def test_get_cached_repo_files_with_all_files(self) -> None: files = self.install.get_cached_repo_files(self.repo.name, "master", 0) assert files == ["src/foo.py"] + @responses.activate + def test_get_cached_repo_files_caches_not_found(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=404, + json={"message": "Not Found"}, + ) + repo_key = f"github:repo:{self.repo.name}:source-code" + assert cache.get(repo_key) is None + + files = self.install.get_cached_repo_files(self.repo.name, "master", 0) + assert files == [] + assert cache.get(repo_key) == [] + + # Negative-cache hit should avoid an additional API request. + files = self.install.get_cached_repo_files(self.repo.name, "master", 0) + assert files == [] + assert len(responses.calls) == 1 + + @responses.activate + def test_get_cached_repo_files_not_found_cache_ttl_is_staggered(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=404, + json={"message": "Not Found"}, + ) + + shifted_seconds = 3600 + repo_key = f"github:repo:{self.repo.name}:source-code" + with mock.patch( + "sentry.integrations.source_code_management.repo_trees.cache.set" + ) as cache_set: + self.install.get_cached_repo_files(self.repo.name, "master", shifted_seconds) + + cache_set.assert_called_once_with( + repo_key, + [], + self.install.CACHE_SECONDS + shifted_seconds, + ) + + @responses.activate + def test_get_cached_repo_files_raises_non_not_found_api_error(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=500, + json={"message": "Server Error"}, + ) + + with pytest.raises(ApiError): + self.install.get_cached_repo_files(self.repo.name, "master", 0) + + @responses.activate + def test_get_cached_repo_files_raises_403_with_not_found_body(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=403, + json={"message": "Not Found."}, + ) + repo_key = f"github:repo:{self.repo.name}:source-code" + + with pytest.raises(ApiError): + self.install.get_cached_repo_files(self.repo.name, "master", 0) + + # Do not cache permission failures as missing resources. + assert cache.get(repo_key) is None + @mock.patch("sentry.integrations.github.client.get_jwt", return_value="jwt_token_1") @responses.activate def test_update_comment(self, get_jwt) -> None: diff --git a/tests/sentry/integrations/github/test_integration.py b/tests/sentry/integrations/github/test_integration.py index 3a32685c5fa8..3c4ed02073aa 100644 --- a/tests/sentry/integrations/github/test_integration.py +++ b/tests/sentry/integrations/github/test_integration.py @@ -14,7 +14,6 @@ from django.urls import reverse import sentry -from fixtures.github import INSTALLATION_EVENT_EXAMPLE from sentry.constants import ObjectStatus from sentry.integrations.github import client from sentry.integrations.github import integration as github_integration @@ -57,11 +56,12 @@ from sentry.testutils.cases import APITestCase, IntegrationTestCase from sentry.testutils.helpers import with_feature from sentry.testutils.helpers.integrations import get_installation_of_type -from sentry.testutils.helpers.options import override_options from sentry.testutils.silo import assume_test_silo_mode, control_silo_test from sentry.users.services.user.serial import serialize_rpc_user from sentry.utils.cache import cache +_ = sentry + TREE_RESPONSES = { "xyz": {"status_code": 200, "body": {"tree": [{"path": "src/xyz.py", "type": "blob"}]}}, "foo": { @@ -489,118 +489,6 @@ def test_plugin_migration(self) -> None: # Doesn't touch Repositories not accessible by the new Integration assert Repository.objects.get(id=inaccessible_repo.id).integration_id is None - @responses.activate - def test_basic_flow(self) -> None: - with self.tasks(): - self.assert_setup_flow() - - integration = Integration.objects.get(provider=self.provider.key) - - assert integration.external_id == self.installation_id - assert integration.name == "Test Organization" - assert integration.metadata == { - "access_token": self.access_token, - # The metadata doesn't get saved with the timezone "Z" character - "expires_at": self.expires_at[:-1], - "icon": "http://example.com/avatar.png", - "domain_name": "github.com/Test-Organization", - "account_type": "Organization", - "account_id": 60591805, - "permissions": { - "administration": "read", - "contents": "read", - "issues": "write", - "metadata": "read", - "pull_requests": "read", - }, - } - oi = OrganizationIntegration.objects.get( - integration=integration, organization_id=self.organization.id - ) - assert oi.config == {} - - @responses.activate - @patch("sentry.integrations.utils.metrics.EventLifecycle.record_event") - def test_installation_not_found(self, mock_record: MagicMock) -> None: - # Add a 404 for an org to responses - responses.replace( - responses.GET, self.base_url + f"/app/installations/{self.installation_id}", status=404 - ) - # Attempt to install integration - resp = self.client.get( - "{}?{}".format(self.setup_path, urlencode({"installation_id": self.installation_id})) - ) - resp = self.client.get( - "{}?{}".format( - self.setup_path, - urlencode( - {"code": "12345678901234567890", "state": "ddd023d87a913d5226e2a882c4c4cc05"} - ), - ) - ) - assert b"Invalid state" in resp.content - assert_failure_metric(mock_record, GitHubInstallationError.INVALID_STATE) - - @responses.activate - @patch("sentry.integrations.utils.metrics.EventLifecycle.record_event") - @override_options({"github-app.webhook-secret": ""}) - def test_github_user_mismatch(self, mock_record: MagicMock) -> None: - self._stub_github() - self._setup_without_existing_installations() - - # Emulate GitHub installation - init_path_1 = "{}?{}".format( - reverse( - "sentry-organization-integrations-setup", - kwargs={ - "organization_slug": self.organization.slug, - "provider_id": self.provider.key, - }, - ), - urlencode({"installation_id": self.installation_id}), - ) - self.client.get(init_path_1) - - webhook_event = orjson.loads(INSTALLATION_EVENT_EXAMPLE) - webhook_event["installation"]["id"] = self.installation_id - webhook_event["sender"]["login"] = "attacker" - resp = self.client.post( - path="/extensions/github/webhook/", - data=orjson.dumps(webhook_event), - content_type="application/json", - HTTP_X_GITHUB_EVENT="installation", - HTTP_X_HUB_SIGNATURE="sha1=d184e6717f8bfbcc291ebc8c0756ee446c6c9486", - HTTP_X_GITHUB_DELIVERY="00000000-0000-4000-8000-1234567890ab", - ) - assert resp.status_code == 204 - - # Validate the installation user - user_2 = self.create_user("foo@example.com") - org_2 = self.create_organization(name="Rowdy Tiger", owner=user_2) - self.login_as(user_2) - init_path_2 = "{}?{}".format( - reverse( - "sentry-organization-integrations-setup", - kwargs={ - "organization_slug": org_2.slug, - "provider_id": self.provider.key, - }, - ), - urlencode({"installation_id": self.installation_id}), - ) - setup_path_2 = "{}?{}".format( - self.setup_path, - urlencode({"code": "12345678901234567890", "state": self.pipeline.signature}), - ) - with self.feature({"system:multi-region": True}): - resp = self.client.get(init_path_2) - resp = self.client.get(setup_path_2) - self.assertTemplateUsed(resp, "sentry/integrations/github-integration-failed.html") - assert resp.status_code == 200 - assert b'window.opener.postMessage({"success":false' in resp.content - assert b"Authenticated user is not the same as who installed the app" in resp.content - assert_failure_metric(mock_record, GitHubInstallationError.USER_MISMATCH) - @responses.activate def test_disable_plugin_when_fully_migrated(self) -> None: self._stub_github() @@ -750,7 +638,7 @@ def test_get_repositories_all_and_pagination(self) -> None: GitHubIntegration, integration, self.organization.id ) - with patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1): + with patch.object(client.GitHubBaseClient, "page_size", 1): result = installation.get_repositories() assert result == [ { @@ -785,10 +673,8 @@ def test_get_repositories_only_first_page(self) -> None: ) with ( - patch.object( - sentry.integrations.github.client.GitHubBaseClient, "page_number_limit", 1 - ), - patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1), + patch.object(client.GitHubBaseClient, "page_number_limit", 1), + patch.object(client.GitHubBaseClient, "page_size", 1), ): result = installation.get_repositories() assert result == [ @@ -1008,11 +894,12 @@ def get_installation_helper(self) -> GitHubIntegration: def _expected_trees(self, repo_info_list=None): result = {} # bar (409 empty repo) returns an empty RepoTree since we cache the result - # baz (404) still fails and is excluded + # baz (404) also returns an empty RepoTree because we cache not-found outcomes list = repo_info_list or [ ("xyz", "master", ["src/xyz.py"]), ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] for repo, branch, files in list: result[f"{self.gh_org}/{repo}"] = RepoTree( @@ -1094,6 +981,7 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self) -> None: # Now that the rate limit is reset we should get files for foo ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] ) @@ -1126,6 +1014,7 @@ def test_get_trees_for_org_rate_limit_401(self) -> None: ("xyz", "master", ["src/xyz.py"]), ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] ) @@ -1172,9 +1061,10 @@ def test_get_trees_for_org_makes_API_requests_before_MAX_CONNECTION_ERRORS_is_hi # xyz is missing because its request errors # foo has data because its API request is made in spite of xyz's error ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), - # bar (409 empty repo) is present with empty files since we cache the result - # baz (404) is missing because its API request throws an error + # bar (409 empty repo) and baz (404) are present with empty files + # since both outcomes are cached ("bar", "main", []), + ("baz", "master", []), ] )