From 82d3df4d5993e1b0b20b945f7fc269de2f0ad44f Mon Sep 17 00:00:00 2001 From: Stuart Mumford Date: Wed, 25 Jul 2018 09:58:24 +0100 Subject: [PATCH 1/2] Add a cache class for installation IDs --- baldrick/github/github_auth.py | 102 +++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 4 deletions(-) diff --git a/baldrick/github/github_auth.py b/baldrick/github/github_auth.py index 1472396..78a0432 100644 --- a/baldrick/github/github_auth.py +++ b/baldrick/github/github_auth.py @@ -1,13 +1,12 @@ import os +import time import netrc import datetime -from collections import defaultdict - -import dateutil.parser +from collections import defaultdict, UserDict import jwt - import requests +import dateutil.parser TEN_MIN = datetime.timedelta(minutes=9) ONE_MIN = datetime.timedelta(minutes=1) @@ -112,6 +111,101 @@ def github_request_headers(installation): return headers +class InstallationCache(UserDict): + """ + Retrieve and cache a repository : installation mapping. + + The objective of this class is to do the minimum amount of API calls to + GitHub, while also having a way to respond to installations being + uninstalled, and therefore removed from the cache. + + To get the repositories for an installation id is one API call per + installation, while getting the list of installation ids is one API call. + + This class maintains a cache, which it will try and fetch installation ids + from. The cache has a TTL, which when it expires the whole cache will be + invalidated, as this is the only way to detect a repository being removed + from an installation (as opposed to the installation being removed). + + When a key is requested from the cache, before refreshing the whole cache, + a check is made for new installations as this is a cheaper operation than + checking for new repositories in existing installations. + """ + def __init__(self, ttl=48*60*60): + self.ttl = ttl + self._last_removal = time.time() + self._install_ids = set() + super().__init__() + + def clear_cache(self): + """ + Clear the cache and fetch a whole new lot from GitHub. + """ + self.data = {} + self._last_removal = time.time() + self._refresh_all_installations() + + def _update_install_ids(self): + url = 'https://api.github.com/app/installations' + headers = {} + headers['Authorization'] = 'Bearer {0}'.format(get_json_web_token()) + headers['Accept'] = 'application/vnd.github.machine-man-preview+json' + resp = requests.get(url, headers=headers) + payload = resp.json() + + self._install_ids = {p['id'] for p in payload} + + # TODO: We could cache this function to prevent repeat refreshes in short time + def _get_repos_for_id(self, iid): + headers = github_request_headers(iid) + resp = requests.get('https://api.github.com/installation/repositories', headers=headers) + payload = resp.json() + + return [repo['full_name'] for repo in payload['repositories']] + + def _refresh_all_installations(self): + self._update_install_ids() + + for iid in self._install_ids: + repos = self._get_repos_for_id(iid) + for repo in repos: + self.data[repo] = iid + + def _add_new_installations_to_map(self): + self._update_install_ids() + + mapped_ids = self.data.values() + + # Iterate over all the installation ids that are not in the map + for iid in self._install_ids.difference(mapped_ids): + repos = self._get_repos_for_id(iid) + for repo in repos: + self.data[repo] = iid + + def __getitem__(self, item): + refreshed = (time.time() - self._last_removal) > self.ttl + if refreshed: + self.clear_cache() + + # First try with current cache, then check for new installations. + try: + return super().__getitem__(item) + except KeyError: + self._add_new_installations_to_map() + + # If not found in the new installations, we have to refresh the whole + # lot in case someone added a new repo to an existing installation. + try: + return super().__getitem__(item) + except KeyError: + # If we have already blown away the cache, don't do it again. + if not refreshed: + self.clear_cache() # TODO: This duplicates the work done above + + # Try now, and raise KeyError if the repo is still not found. + return super().__getitem__(item) + + def repo_to_installation_id_mapping(): """ Returns a dictionary mapping full repository name to installation id. From d30886bc2b2b4a668cc316b499073f68eab6cbfa Mon Sep 17 00:00:00 2001 From: Stuart Mumford Date: Mon, 30 Jul 2018 15:03:24 +0100 Subject: [PATCH 2/2] comments --- baldrick/github/github_auth.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/baldrick/github/github_auth.py b/baldrick/github/github_auth.py index 78a0432..ae62a79 100644 --- a/baldrick/github/github_auth.py +++ b/baldrick/github/github_auth.py @@ -130,6 +130,9 @@ class InstallationCache(UserDict): When a key is requested from the cache, before refreshing the whole cache, a check is made for new installations as this is a cheaper operation than checking for new repositories in existing installations. + + Then we can use the username of the repository in the key to determine if + we already have an installation for that user, and just refresh that user. """ def __init__(self, ttl=48*60*60): self.ttl = ttl