Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions src/sentry/integrations/source_code_management/repo_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ def get_cached_repo_files(

repo_full_name: e.g. getsentry/sentry
tree_sha: A branch or a commit sha
shifted_seconds: Staggers cache expiration times across repositories
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The caller makes the cache expire around 24 hour buckets (per repo) so it doesn't all expire around the same hour window.

so cache misses and API refreshes are spread out over time.
only_source_code_files: Include all files or just the source code files
only_use_cache: Do not hit the network but use the value from the cache
if any. This is useful if the remaining API requests are low
Expand All @@ -199,6 +201,7 @@ def get_cached_repo_files(
use_api = not cache_hit and not only_use_cache
repo_files: list[str] = cache.get(key, [])
if use_api:
tree = None
# Cache miss – fetch from API
try:
tree = self.get_client().get_tree(repo_full_name, tree_sha)
Expand All @@ -209,8 +212,14 @@ def get_cached_repo_files(
"Caching empty files result for repo",
extra={"repo": repo_full_name},
)
cache.set(key, [], self.CACHE_SECONDS + shifted_seconds)
tree = None
except ApiError as error:
if _is_not_found_error(error):
logger.info(
"Caching empty files result for missing repo or ref",
extra={"repo": repo_full_name},
)
else:
raise
if tree:
# Keep files; discard directories
repo_files = [node["path"] for node in tree if node["type"] == "blob"]
Expand All @@ -225,6 +234,8 @@ def get_cached_repo_files(
# repositories is a single API network request, thus,
# being acceptable to sometimes not having everything cached
cache.set(key, repo_files, self.CACHE_SECONDS + shifted_seconds)
else:
cache.set(key, [], self.CACHE_SECONDS + shifted_seconds)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty list as a negative cache.


metrics.incr(
f"{METRICS_KEY_PREFIX}.get_tree",
Expand Down Expand Up @@ -296,3 +307,11 @@ def should_include(file_path: str) -> bool:
if any(file_path.startswith(path) for path in EXCLUDED_PATHS):
return False
return True


def _is_not_found_error(error: ApiError) -> bool:
if error.code == 404:
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In all honestly, it is likely the 404 is real and even if we only fetch once a day we would still get a 404 but who knows maybe GitHub may send the wrong status code.

return True

error_message = error.json.get("message") if error.json else error.text
return error_message in ("Not Found", "Not Found.")
54 changes: 54 additions & 0 deletions tests/sentry/integrations/github/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,60 @@ def test_get_cached_repo_files_with_all_files(self) -> None:
files = self.install.get_cached_repo_files(self.repo.name, "master", 0)
assert files == ["src/foo.py"]

@responses.activate
def test_get_cached_repo_files_caches_not_found(self) -> None:
responses.add(
method=responses.GET,
url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1",
status=404,
json={"message": "Not Found"},
)
repo_key = f"github:repo:{self.repo.name}:source-code"
assert cache.get(repo_key) is None

files = self.install.get_cached_repo_files(self.repo.name, "master", 0)
assert files == []
assert cache.get(repo_key) == []

# Negative-cache hit should avoid an additional API request.
files = self.install.get_cached_repo_files(self.repo.name, "master", 0)
assert files == []
assert len(responses.calls) == 1

@responses.activate
def test_get_cached_repo_files_not_found_cache_ttl_is_staggered(self) -> None:
responses.add(
method=responses.GET,
url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1",
status=404,
json={"message": "Not Found"},
)

shifted_seconds = 3600
repo_key = f"github:repo:{self.repo.name}:source-code"
with mock.patch(
"sentry.integrations.source_code_management.repo_trees.cache.set"
) as cache_set:
self.install.get_cached_repo_files(self.repo.name, "master", shifted_seconds)

cache_set.assert_called_once_with(
repo_key,
[],
self.install.CACHE_SECONDS + shifted_seconds,
)

@responses.activate
def test_get_cached_repo_files_raises_non_not_found_api_error(self) -> None:
responses.add(
method=responses.GET,
url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1",
status=500,
json={"message": "Server Error"},
)

with pytest.raises(ApiError):
self.install.get_cached_repo_files(self.repo.name, "master", 0)

@mock.patch("sentry.integrations.github.client.get_jwt", return_value="jwt_token_1")
@responses.activate
def test_update_comment(self, get_jwt) -> None:
Expand Down
25 changes: 14 additions & 11 deletions tests/sentry/integrations/github/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
from django.http import HttpResponse
from django.urls import reverse

import sentry
from fixtures.github import INSTALLATION_EVENT_EXAMPLE
from sentry.constants import ObjectStatus
from sentry.integrations.github import client
from sentry.integrations.github import integration as github_integration
from sentry.integrations.github.client import (
MINIMUM_REQUESTS,
GitHubApiClient,
GitHubBaseClient,
GithubSetupApiClient,
)
from sentry.integrations.github.integration import (
Expand Down Expand Up @@ -750,7 +750,7 @@ def test_get_repositories_all_and_pagination(self) -> None:
GitHubIntegration, integration, self.organization.id
)

with patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1):
with patch.object(GitHubBaseClient, "page_size", 1):
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Orthogonal change.

result = installation.get_repositories()
assert result == [
{
Expand Down Expand Up @@ -785,10 +785,8 @@ def test_get_repositories_only_first_page(self) -> None:
)

with (
patch.object(
sentry.integrations.github.client.GitHubBaseClient, "page_number_limit", 1
),
patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1),
patch.object(GitHubBaseClient, "page_number_limit", 1),
patch.object(GitHubBaseClient, "page_size", 1),
):
result = installation.get_repositories()
assert result == [
Expand Down Expand Up @@ -1007,12 +1005,13 @@ def get_installation_helper(self) -> GitHubIntegration:

def _expected_trees(self, repo_info_list=None):
result = {}
# bar (409 empty repo) returns an empty RepoTree since we cache the result
# baz (404) still fails and is excluded
# bar (409 empty repo) and baz (404 missing repo/ref) both return empty
# RepoTrees since those responses are negative-cached.
list = repo_info_list or [
("xyz", "master", ["src/xyz.py"]),
("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]),
("bar", "main", []),
("baz", "master", []),
]
for repo, branch, files in list:
result[f"{self.gh_org}/{repo}"] = RepoTree(
Expand Down Expand Up @@ -1094,6 +1093,7 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self) -> None:
# Now that the rate limit is reset we should get files for foo
("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]),
("bar", "main", []),
("baz", "master", []),
]
)

Expand All @@ -1118,14 +1118,16 @@ def test_get_trees_for_org_rate_limit_401(self) -> None:
)

# This time the rate limit will not fail, thus, it will fetch the trees
# bar (409 empty repo) now returns an empty RepoTree since we cache the empty result
# bar (409) and baz (404) now return empty RepoTrees because those
# responses are cached as empty results.
self.set_rate_limit()
trees = installation.get_trees_for_org()
assert trees == self._expected_trees(
[
("xyz", "master", ["src/xyz.py"]),
("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]),
("bar", "main", []),
("baz", "master", []),
]
)

Expand Down Expand Up @@ -1172,9 +1174,10 @@ def test_get_trees_for_org_makes_API_requests_before_MAX_CONNECTION_ERRORS_is_hi
# xyz is missing because its request errors
# foo has data because its API request is made in spite of xyz's error
("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]),
# bar (409 empty repo) is present with empty files since we cache the result
# baz (404) is missing because its API request throws an error
# bar (409) and baz (404) are present with empty files
# because both outcomes are cached as empty results.
("bar", "main", []),
("baz", "master", []),
]
)

Expand Down
Loading