-
-
Notifications
You must be signed in to change notification settings - Fork 4.7k
fix(integrations): Cache missing GitHub repo tree lookups #113113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
c29b65f
0e55365
8057229
498e27d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -190,6 +190,8 @@ def get_cached_repo_files( | |
|
|
||
| repo_full_name: e.g. getsentry/sentry | ||
| tree_sha: A branch or a commit sha | ||
| shifted_seconds: Staggers cache expiration times across repositories | ||
| so cache misses and API refreshes are spread out over time. | ||
| only_source_code_files: Include all files or just the source code files | ||
| only_use_cache: Do not hit the network but use the value from the cache | ||
| if any. This is useful if the remaining API requests are low | ||
|
|
@@ -199,6 +201,7 @@ def get_cached_repo_files( | |
| use_api = not cache_hit and not only_use_cache | ||
| repo_files: list[str] = cache.get(key, []) | ||
| if use_api: | ||
| tree = None | ||
| # Cache miss – fetch from API | ||
| try: | ||
| tree = self.get_client().get_tree(repo_full_name, tree_sha) | ||
|
|
@@ -209,8 +212,14 @@ def get_cached_repo_files( | |
| "Caching empty files result for repo", | ||
| extra={"repo": repo_full_name}, | ||
| ) | ||
| cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) | ||
| tree = None | ||
| except ApiError as error: | ||
| if _is_not_found_error(error): | ||
| logger.info( | ||
| "Caching empty files result for missing repo or ref", | ||
| extra={"repo": repo_full_name}, | ||
| ) | ||
| else: | ||
| raise | ||
| if tree: | ||
| # Keep files; discard directories | ||
| repo_files = [node["path"] for node in tree if node["type"] == "blob"] | ||
|
|
@@ -225,6 +234,8 @@ def get_cached_repo_files( | |
| # repositories is a single API network request, thus, | ||
| # being acceptable to sometimes not having everything cached | ||
| cache.set(key, repo_files, self.CACHE_SECONDS + shifted_seconds) | ||
| else: | ||
| cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Empty list as a negative cache. |
||
|
|
||
| metrics.incr( | ||
| f"{METRICS_KEY_PREFIX}.get_tree", | ||
|
|
@@ -296,3 +307,11 @@ def should_include(file_path: str) -> bool: | |
| if any(file_path.startswith(path) for path in EXCLUDED_PATHS): | ||
| return False | ||
| return True | ||
|
|
||
|
|
||
| def _is_not_found_error(error: ApiError) -> bool: | ||
| if error.code == 404: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In all honestly, it is likely the 404 is real and even if we only fetch once a day we would still get a 404 but who knows maybe GitHub may send the wrong status code. |
||
| return True | ||
|
|
||
| error_message = error.json.get("message") if error.json else error.text | ||
| return error_message in ("Not Found", "Not Found.") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,14 +13,14 @@ | |
| from django.http import HttpResponse | ||
| from django.urls import reverse | ||
|
|
||
| import sentry | ||
| from fixtures.github import INSTALLATION_EVENT_EXAMPLE | ||
| from sentry.constants import ObjectStatus | ||
| from sentry.integrations.github import client | ||
| from sentry.integrations.github import integration as github_integration | ||
| from sentry.integrations.github.client import ( | ||
| MINIMUM_REQUESTS, | ||
| GitHubApiClient, | ||
| GitHubBaseClient, | ||
| GithubSetupApiClient, | ||
| ) | ||
| from sentry.integrations.github.integration import ( | ||
|
|
@@ -750,7 +750,7 @@ def test_get_repositories_all_and_pagination(self) -> None: | |
| GitHubIntegration, integration, self.organization.id | ||
| ) | ||
|
|
||
| with patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1): | ||
| with patch.object(GitHubBaseClient, "page_size", 1): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Orthogonal change. |
||
| result = installation.get_repositories() | ||
| assert result == [ | ||
| { | ||
|
|
@@ -785,10 +785,8 @@ def test_get_repositories_only_first_page(self) -> None: | |
| ) | ||
|
|
||
| with ( | ||
| patch.object( | ||
| sentry.integrations.github.client.GitHubBaseClient, "page_number_limit", 1 | ||
| ), | ||
| patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1), | ||
| patch.object(GitHubBaseClient, "page_number_limit", 1), | ||
| patch.object(GitHubBaseClient, "page_size", 1), | ||
| ): | ||
| result = installation.get_repositories() | ||
| assert result == [ | ||
|
|
@@ -1007,12 +1005,13 @@ def get_installation_helper(self) -> GitHubIntegration: | |
|
|
||
| def _expected_trees(self, repo_info_list=None): | ||
| result = {} | ||
| # bar (409 empty repo) returns an empty RepoTree since we cache the result | ||
| # baz (404) still fails and is excluded | ||
| # bar (409 empty repo) and baz (404 missing repo/ref) both return empty | ||
| # RepoTrees since those responses are negative-cached. | ||
| list = repo_info_list or [ | ||
| ("xyz", "master", ["src/xyz.py"]), | ||
| ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), | ||
| ("bar", "main", []), | ||
| ("baz", "master", []), | ||
| ] | ||
| for repo, branch, files in list: | ||
| result[f"{self.gh_org}/{repo}"] = RepoTree( | ||
|
|
@@ -1094,6 +1093,7 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self) -> None: | |
| # Now that the rate limit is reset we should get files for foo | ||
| ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), | ||
| ("bar", "main", []), | ||
| ("baz", "master", []), | ||
| ] | ||
| ) | ||
|
|
||
|
|
@@ -1118,14 +1118,16 @@ def test_get_trees_for_org_rate_limit_401(self) -> None: | |
| ) | ||
|
|
||
| # This time the rate limit will not fail, thus, it will fetch the trees | ||
| # bar (409 empty repo) now returns an empty RepoTree since we cache the empty result | ||
| # bar (409) and baz (404) now return empty RepoTrees because those | ||
| # responses are cached as empty results. | ||
| self.set_rate_limit() | ||
| trees = installation.get_trees_for_org() | ||
| assert trees == self._expected_trees( | ||
| [ | ||
| ("xyz", "master", ["src/xyz.py"]), | ||
| ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), | ||
| ("bar", "main", []), | ||
| ("baz", "master", []), | ||
| ] | ||
| ) | ||
|
|
||
|
|
@@ -1172,9 +1174,10 @@ def test_get_trees_for_org_makes_API_requests_before_MAX_CONNECTION_ERRORS_is_hi | |
| # xyz is missing because its request errors | ||
| # foo has data because its API request is made in spite of xyz's error | ||
| ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), | ||
| # bar (409 empty repo) is present with empty files since we cache the result | ||
| # baz (404) is missing because its API request throws an error | ||
| # bar (409) and baz (404) are present with empty files | ||
| # because both outcomes are cached as empty results. | ||
| ("bar", "main", []), | ||
| ("baz", "master", []), | ||
| ] | ||
| ) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The caller makes the cache expire around 24 hour buckets (per repo) so it doesn't all expire around the same hour window.