From f9b7fb3352b5a0934cc38832bbb1b67a9789ea2e Mon Sep 17 00:00:00 2001
From: beckerfy <fynn.becker@hs-hannover.de>
Date: Wed, 10 Jul 2019 14:09:34 +0200
Subject: [PATCH] Close #2 Add pagination

Add pagination along with lazy evaluation, caching, counting strategies and a bunch of documentation on these concepts.
---
 README.md                                | 151 +++++++++++++++++++++--
 postgrestutils/client/__init__.py        |   6 +-
 postgrestutils/client/postgrestclient.py | 151 ++++++++++++++++++++---
 postgrestutils/client/utils.py           |   7 +-
 4 files changed, 286 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index ea9a9b2..fc06d7e 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,172 @@
-### postgrestutils
+# postgrestutils
 A very basic POSTGREST client and utils
-#### Setup
+## Usage
-##### Django
+### Setup
+
+#### Django
 - add `"postgrestutils"` to your `INSTALLED_APPS` setting
 - add `POSTGREST_UTILS_BASE_URI` (should default to the most frequently used POSTGREST instance in the future) and `POSTGREST_UTILS_JWT` to your project settings
 ```python
 from postgrestutils.client import pgrest_client
-payload = {
+params = {
     "select": "id,forename",
     "forename": "eq.Jebediah"
 }
 # this will send a request to 'POSTGREST_UTILS_BASE_URI/kerbals?select=id,forename&forename=eq.Jebediah'
-res = pgrest_client.get("kerbals", params=payload)
+res = pgrest_client.get("kerbals", params=params)
 ```
-##### Other projects
+#### Other projects
 ```python
 from postgrestutils.client import pgrest_client
 pgrest_client.configure('your-JWT', base_uri='http://127.0.0.1:3000')
-payload = {
+params = {
     "select": "id,forename"
 }
-res = pgrest_client.get("kerbals", params=payload)
+res = pgrest_client.get("kerbals", params=params)
+```
+
+### Making requests
+
+`postgrestutils` tries to be as intuitive and pythonic as possible while also being efficient.
+In order to do so, it combines several concepts such as lazy evaluation, caching and counting strategies.
+
+Understanding these concepts allows you to write the most efficient code for your specific use case.
+If you're already familiar with how django querysets work, this should feel fairly natural to you.
+There are, however, a few differences that will be explained in detail below.
+
+#### Lazy evaluation
+
+Akin to django querysets, `postgrestutils` has a `LazyPostgrestJsonResult` that is returned from calls to `pgrest_client.get()` without making any API calls yet.
+If you're familiar with django's rules for evaluation, this list won't surprise you.
+Since there are a few subtle differences, however, here is what will cause evaluation of a `LazyPostgrestJsonResult`:
+
+- Iteration.
+A `LazyPostgrestJsonResult` is iterable and will fetch all elements from the API the first time you iterate over it.
+- Slicing.
+This will fetch the elements in the specified range.
+- `repr()`.
+As a convenience for interactive interpreter sessions, this will fetch the first 20 elements.
+- `len()`.
+Unsurprisingly, this returns the count of the requested table.
+Depending on the [counting strategy](#counting-strategies) this has different implications, such as the cache being populated.
+- `list()`.
+This can be useful to force evaluation of a `LazyPostgrestJsonResult`.
+- `bool()`.
+Using a `LazyPostgrestJsonResult` in any boolean context will evaluate it.
+- Using the `singular=True` kwarg.
+Getting some lazy object when explicitly requesting a single element doesn't make much sense.
+Like django's `Model.objects.get()`, this will return the requested element or raise an HTTPError if none or multiple objects were found.
+
+#### Pagination
+
+Remember the part about `postgrestutils` trying to be intuitive and pythonic?
+Do you know about Python's slicing notation?
+Great, you already know how pagination works.
+Just to be sure, here is a snippet of pagination in action:
+
+```python
+>>> business_roles = pgrest_client.get('business_role')
+>>> business_roles[:3] # fetches the first 3 business roles
+>>> business_roles[3:6] # fetches the next 3 business roles
+```
+
+#### Caching
+
+Also like django querysets, there is a cache to minimize API calls.
+Here's a short snippet explaining the gist of it:
+
+```python
+# Bad: Fetches the same data from the API twice
+>>> print([role['id'] for role in pgrest_client.get('business_role')])
+>>> print([role['name'] for role in pgrest_client.get('business_role')])
+
+# Good: Uses the cache resulting in only a single API request
+>>> business_roles = pgrest_client.get('business_role')
+>>> print([role['id'] for role in business_roles]) # fetches all elements into the cache
+>>> print([role['name'] for role in business_roles]) # re-uses the cached elements
+```
+
+##### When results are not cached
+
+There are a few cases where a `LazyPostgrestJsonResult` will not cache results:
+
+- Indexing and slicing.
+If the cache is not yet populated, indexing and slicing - even on the same indices/ranges - will result in an API call.
+
+```python
+# without populated cache
+>>> business_roles = pgrest_client.get('business_role')
+>>> business_roles[5] # fetches the 6th element from the API
+>>> business_roles[5] # fetches the 6th element from the API again
+
+# with populated cache
+>>> business_roles = pgrest_client.get('business_role')
+>>> list(business_roles) # fetches all elements from the API
+>>> business_roles[5] # re-uses the cached elements
+>>> business_roles[5] # re-uses the cached elements
 ```
-### Filtering
+- `repr()`.
+Since this just returns a slice of itself, the cache won't be populated.
+- `len()` when not using the `count=Count.NONE` kwarg.
+Counting strategies other than `Count.NONE` are not required to fetch all elements in order to determine their length.
+[More on counting strategies.](#counting-strategies)
+
+##### A note on caching and `len()`
+
+Since calling `len()` can be considerably expensive, its result is also cached.
+Any subsequent calls will re-use the cache instead of making any API calls.
+
+##### Invalidating the cache
+
+If you have a `LazyPostgrestJsonResult` around that you want to re-use but need up-to-date data, simply call the `.refresh_from_pgrest()` method on it.
+That will lazily refresh data from PostgREST by invalidating the cache.
+Your object will now behave as if you had just created it.
+
+#### Counting strategies
+
+PostgREST currently offers two [counting strategies](http://postgrest.org/en/stable/api.html#limits-and-pagination): counting and not counting.
+`postgrestutils` lets you decide which one to use by specifying the `count` kwarg.
+
+##### Using `count=Count.NONE`
+
+If you don't need to know the count for your request, this is obviously a good counting strategy to choose.
+But what happens if you need the count and just call `len()` on your `LazyPostgrestJsonResult` anyway?
+This is again similar to what django querysets do.
+It will evaluate the `LazyPostgrestJsonResult`, fetching all elements from the API into the cache, and return the length of the cache.
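+
+Here is a short sketch of what that looks like, re-using the `business_role` endpoint from the examples above (`count=Count.NONE` is spelled out only for clarity since it is the default):
+
+```python
+>>> from postgrestutils.client import Count, pgrest_client
+>>> business_roles = pgrest_client.get('business_role', count=Count.NONE) # lazy, no API request yet
+>>> len(business_roles) # fetches all elements into the cache and returns the length of the cache
+>>> list(business_roles) # re-uses the cached elements, no additional API request
+```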
+
+##### Using `count=Count.EXACT`
+
+You've learned that `count=Count.NONE` will count your elements just fine, so why would you ever want to use this option?
+The reason is quite simple: fetching all elements of a large table can be expensive, and unnecessarily so if you don't even need them.
+That's often the case when using pagination.
+You want to show a subset of all elements but also display how many pages of elements there are.
+To do so you need the count of all elements and the first few elements, depending on your page size.
+What you don't need, however, is all of the elements, so why fetch them?
+
+This counting strategy allows you to get the count without fetching all elements.
+So what happens when calling `len()` on your `LazyPostgrestJsonResult` this time?
+`postgrestutils` will explicitly request the count for your request, which will be cheaper for large tables.
+
+Be careful with this for very large tables, however, as it can take a very long time, as explained in the [PostgREST documentation](http://postgrest.org/en/stable/admin.html#count-header-dos).
+As also mentioned there, future versions will support estimating the count.
+
+#### Filtering
 http://postgrest.org/en/stable/api.html
-#### Django helpers
+### Django helpers
-##### custom `user_account_fetched` signal
+#### custom `user_account_fetched` signal
 `postgrestutils` provides a custom signal called `user_account_fetched` which provides the current request and the account of the current user on login.
 To use this feature (and you really should 99.9% of the time) configure your settings accordingly by specifying the columns you need from the `account` endpoint:
diff --git a/postgrestutils/client/__init__.py b/postgrestutils/client/__init__.py
index e385b1e..fce495c 100644
--- a/postgrestutils/client/__init__.py
+++ b/postgrestutils/client/__init__.py
@@ -1,5 +1,7 @@
-from . import postgrestclient
 from .. import app_settings
+from .postgrestclient import Count, PostgrestClient
 
 # the instance of the client to be used
-pgrest_client = postgrestclient.PostgrestClient(app_settings.BASE_URI, app_settings.JWT)
+pgrest_client = PostgrestClient(app_settings.BASE_URI, app_settings.JWT)
+
+__all__ = ['Count', 'pgrest_client']
diff --git a/postgrestutils/client/postgrestclient.py b/postgrestutils/client/postgrestclient.py
index ddf0abd..9134dc2 100644
--- a/postgrestutils/client/postgrestclient.py
+++ b/postgrestutils/client/postgrestclient.py
@@ -1,9 +1,15 @@
+import copy
+import enum
 from urllib.parse import urljoin
 
 import requests
 
 from postgrestutils.client.utils import datetime_parser
 
+REPR_OUTPUT_SIZE = 20
+
+Count = enum.Enum('Count', (('NONE', None), ('EXACT', 'exact')))
+
 
 class PostgrestClient:
     def __init__(self, base_uri, token=None):
@@ -17,26 +23,141 @@ class PostgrestClient:
         if token:
             self.session.headers['Authorization'] = 'Bearer {}'.format(token)
 
-    def get(self, path, singular=False, parse_dt=True, **kwargs):
+    def get(self, endpoint, singular=False, parse_dt=True, count=Count.NONE, **kwargs):
         """
-        :param path: specifies the endpoint
-        :param singular: if True returns a JSON object rather than a list (406 when multiple results are returned)
-        :param parse_dt: if True attempts to parse datetime strings to python datetime objects
+        :param endpoint: specifies which endpoint to request
+        :param singular: if True returns a JSON object rather than a list (406
+            when multiple results are returned)
+        :param parse_dt: if True parses datetime strings as returned by
+            PostgREST to python datetime objects
+        :param count: counting strategy as explained in the README
         :param kwargs: pass kwargs directly to requests's .get() method
-        :return: result(s) as python object or raises HTTPError
+        :return: single element as dict (singular=True), lazy python object for
+            multiple elements or raises HTTPError
         """
-        if singular:
-            self.session.headers["Accept"] = "application/vnd.pgrst.object+json"
+        if singular:  # immediately evaluate and return result
+            res = LazyPostgrestJsonResult(self, endpoint, singular, parse_dt, count, **kwargs)
+            res._fetch_all()  # will raise requests.HTTPError if no or multiple elements are returned
+            return res._result_cache[0]
+        return LazyPostgrestJsonResult(self, endpoint, singular, parse_dt, count, **kwargs)
+
+
+class LazyPostgrestJsonResult:
+    def __init__(self, client, endpoint, singular, parse_dt, count, **kwargs):
+        self._len_cache = None
+        self._result_cache = None
+
+        self.client = client
+        self.endpoint = endpoint
+        self.singular = singular
+        self.parse_dt = parse_dt
+        self.count = count
+        self.request_kwargs = kwargs
+
+    def __repr__(self):
+        data = list(self[:REPR_OUTPUT_SIZE + 1])
+        if len(data) > REPR_OUTPUT_SIZE:
+            data[-1] = "...(remaining elements truncated)..."
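+        # only REPR_OUTPUT_SIZE elements are shown; the extra element fetched
+        # above merely signals that there are more elements than displayed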
+        return '<{} {}>'.format(self.__class__.__name__, data)
+
+    def __len__(self):
+        if self.count != Count.NONE:
+            self._fetch_len()
         else:
-            self.session.headers["Accept"] = "application/json"
-        res = self.session.get(urljoin(self.base_uri, path), **kwargs)
+            self._fetch_all()
+        return self._len_cache
+
+    def _fetch_len(self):
+        if self._len_cache is None:
+            request_kwargs = copy.deepcopy(self.request_kwargs)
+            request_kwargs.setdefault('headers', dict())['Prefer'] = 'count={}'.format(self.count.value)
+            request_kwargs['headers']['Range-Unit'] = 'items'
+            # Have to request something so just fetch the first item
+            request_kwargs['headers']['Range'] = '0-0'
+            if self.singular:
+                request_kwargs['headers']['Accept'] = 'application/vnd.pgrst.object+json'
+
+            resp = self.client.session.get(urljoin(self.client.base_uri, self.endpoint), **request_kwargs)
+
+            count = int(resp.headers['Content-Range'].split('/')[-1])
+            self._len_cache = count
+
+            # If the request yields only one element anyway, might as well cache
+            # it. When using singular=False, count=Count.EXACT and the result
+            # is a single element this saves an API request in cases where
+            # len() is called before using the result.
+            if count == 1:
+                self._result_cache = self._parse_response(resp)
+
+    def _fetch_all(self):
+        if self._result_cache is None:
+            request_kwargs = copy.deepcopy(self.request_kwargs)
+
+            if self.singular:
+                request_kwargs.setdefault('headers', dict())['Accept'] = 'application/vnd.pgrst.object+json'
+
+            resp = self.client.session.get(urljoin(self.client.base_uri, self.endpoint), **request_kwargs)
+            self._result_cache = self._parse_response(resp)
+
+            # fetched all elements anyway, caching their length is very cheap
+            self._len_cache = len(self._result_cache)
+
+    def _fetch_some(self, range):
+        request_kwargs = copy.deepcopy(self.request_kwargs)
+        request_kwargs.setdefault('headers', dict())['Range-Unit'] = 'items'
+        request_kwargs['headers']['Range'] = range
+
+        if self.singular:
+            request_kwargs['headers']['Accept'] = 'application/vnd.pgrst.object+json'
+
+        resp = self.client.session.get(urljoin(self.client.base_uri, self.endpoint), **request_kwargs)
+        return self._parse_response(resp)
+
+    def _parse_response(self, resp):
         try:
-            res.raise_for_status()
+            resp.raise_for_status()
         except requests.HTTPError as e:
-            raise type(e)(res.status_code, res.reason, res.text)
+            raise type(e)(resp.status_code, resp.reason, resp.text)
 
-        if parse_dt:
-            json_result = res.json(object_hook=datetime_parser)
+        if self.parse_dt:
+            json_result = resp.json(object_hook=datetime_parser)
         else:
-            json_result = res.json()
-        return json_result
+            json_result = resp.json()
+        # always return a list even if it contains a single element only
+        return [json_result] if self.singular else json_result
+
+    def __getitem__(self, key):
+        if not isinstance(key, (int, slice)):
+            raise TypeError(
+                "{self.__class__.__name__} indices must be integers or slices, not {key.__class__.__name__}".format(
+                    self=self,
+                    key=key
+                )
+            )
+        if ((isinstance(key, int) and key < 0) or
+                (isinstance(key, slice) and ((key.start is not None and key.start < 0) or
+                                             (key.stop is not None and key.stop < 0)))):
+            raise ValueError("{self.__class__.__name__} does not support negative indexing".format(self=self))
+        if isinstance(key, slice) and key.step is not None:
+            raise ValueError("{self.__class__.__name__} does not support stepping".format(self=self))
+
+        if self._result_cache is not None:
+            return self._result_cache[key]
+
+        if isinstance(key, slice):
+            range = '{start}-{stop}'.format(
+                start=key.start or 0,
+                stop=key.stop and key.stop - 1 or ''
+            )
+        else:
+            range = '{0}-{0}'.format(key)
+        return self._fetch_some(range)
+
+    def refresh_from_pgrest(self):
+        """Lazily refresh data from PostgREST"""
+        self._result_cache = None
+        self._len_cache = None
+
+    def __iter__(self):
+        self._fetch_all()
+        return iter(self._result_cache)
diff --git a/postgrestutils/client/utils.py b/postgrestutils/client/utils.py
index 5daa063..fa56221 100644
--- a/postgrestutils/client/utils.py
+++ b/postgrestutils/client/utils.py
@@ -50,7 +50,12 @@ def datetime_parser(json_dict):
             parts = clean_parts(match.groupdict())
             if parts.get('offsetsign') and parts.get('offsethours') and parts.get('offsetminutes'):
                 sign = -1 if parts.pop('offsetsign', '+') == '-' else 1
-                tz = timezone(offset=sign * timedelta(hours=int(parts.pop('offsethours')), minutes=int(parts.pop('offsetminutes'))))
+                tz = timezone(
+                    offset=sign * timedelta(
+                        hours=int(parts.pop('offsethours')),
+                        minutes=int(parts.pop('offsetminutes'))
+                    )
+                )
                 parsed_dt = datetime(**parts).replace(tzinfo=tz).astimezone()
             else:
                 # naive datetime so we assume local time
-- 
GitLab