From 802e38922d9c8a8e66a688f55e120c33d47d34c7 Mon Sep 17 00:00:00 2001 From: beckerfy <fynn.becker@hs-hannover.de> Date: Tue, 21 Jul 2020 17:18:05 +0200 Subject: [PATCH] Close #12 Add new counting strategies --- README.md | 40 +++++-- postgrestutils/__init__.py | 10 +- tests/test_postgrestclient.py | 191 ++++++++++++++++++++++++++++++++++ 3 files changed, 230 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8521b68..779dbf0 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ with postgrestutils.Session() as s: By default constructing a new `postgrestutils.Session` will take the settings discussed in [setup](#setup) into account. Hence there is no need to specify `base_uri` or `token` explicitly unless you are using more than one API or database role in your project. -Additionally `postgrestutils.Session` takes `schema: Optional[str] = None`, `parse_dt: bool = True` and `count: Count = Count.NONE` (some of which are explained later on). +Additionally `postgrestutils.Session` takes `schema: Optional[str] = None`, `parse_dt: bool = True` and `count: postgrestutils.Count = postgrestutils.Count.NONE` (some of which are explained later on). These options are session defaults and may be overridden on a per-request basis, e.g. ```python @@ -85,7 +85,7 @@ This can be useful to force evaluation of a `JsonResultSet`. Using a `JsonResultSet` in any boolean context will evaluate it. - Using the `.get()` method on a session. Getting some lazy object when explicitly requesting a single element doesn't make much sense. -Like django's `Model.objects.get()` this will return the requested element or raise a `ObjectDoesNotExist`/`MultipleObjectsReturned` if none or multiple objects were found. +Like django's `Model.objects.get()` this will return the requested element or raise a `postgrestutils.ObjectDoesNotExist`/`postgrestutils.MultipleObjectsReturned` if none or multiple objects were found. 
#### Pagination @@ -142,8 +142,8 @@ If the cache is not yet populated indexing and slicing - even on the same index/ - `repr()`. Since this just returns a slice of itself the cache won't be populated. -- `len()` when not using the `count=Count.NONE` kwarg. -Counting strategies other than `Count.NONE` are not required to fetch all elements in order to determine their length. +- `len()` when not using the `count=postgrestutils.Count.NONE` kwarg. +Counting strategies other than `postgrestutils.Count.NONE` are not required to fetch all elements in order to determine their length. [More on counting strategies.](#counting-strategies) ##### A note on caching and `len()` @@ -160,19 +160,20 @@ Your object will now behave as if you just created it. #### Counting strategies -PostgREST currently offers two [counting strategies](http://postgrest.org/en/stable/api.html#limits-and-pagination): that is counting and not counting. -`postgrestutils` lets you decide on which to use by specifying the `count` kwarg. +PostgREST currently offers multiple [counting strategies](http://postgrest.org/en/stable/api.html#exact-count). +`postgrestutils` lets you decide on which to use by specifying the `count` kwarg on a session or passing it on a per-request basis to `.get()` and `.filter()`. +While this document attempts to explain counting strategies sufficiently consulting the linked PostgREST documentation may be insightful at times. -##### Using `count=Count.NONE` +##### Using `count=postgrestutils.Count.NONE` If you don't need to know the count for your request this is obviously a good counting strategy to choose. But what happens if you need the count and just call `len()` on your `JsonResultSet` anyway? This is again similar to what django querysets do. It will evaluate the `JsonResultSet` fetching all elements from the API into the cache and return the length of the cache. 
-##### Using `count=Count.EXACT` +##### Using `count=postgrestutils.Count.EXACT` -You've learned that `count=Count.NONE` will count your elements just fine so why would you ever want to use this option? +You've learned that `count=postgrestutils.Count.NONE` will count your elements just fine so why would you ever want to use this option? The reason is quite simple: Fetching all elements for a large table can be expensive; and unnecessarily so if you don't even need them. That's often the case when using pagination. You want to show a subset of all elements but also display how many pages with more elements there are. @@ -184,7 +185,26 @@ So what happens when calling `len()` on your `JsonResultSet` this time? `postgrestutils` will explicitly request the count for your request which will be cheaper for large tables. Be careful with this for very large tables however as this can take a very long time as explained in the [PostgREST documentation](http://postgrest.org/en/stable/admin.html#count-header-dos). -As also mentioned there future versions will support estimating the count. + +##### Using `count=postgrestutils.Count.PLANNED` + +Now what? +Your table is very large, `postgrestutils.Count.EXACT` takes too long and `postgrestutils.Count.NONE` is out of the question entirely. + +`postgrestutils.Count.PLANNED` to the rescue. +Using this counting strategy you explicitly instruct the client to leverage PostgreSQL statistics collected from `ANALYZE`ing tables. +This will yield fairly accurate results depending on how often new statistics are collected. + +##### Using `count=postgrestutils.Count.ESTIMATED` + +So `postgrestutils.Count.NONE` and `postgrestutils.Count.EXACT` are feasible for small tables. +For very large tables those either take too long or require too much memory and `postgrestutils.Count.PLANNED` is the only viable alternative. +However `postgrestutils.Count.PLANNED` can potentially lead to deviations even for small tables where they are quite notable. 
+If only we could have the best of both worlds... + +Enter `postgrestutils.Count.ESTIMATED`. +The idea is quite simple: `postgrestutils.Count.ESTIMATED` uses the `postgrestutils.Count.EXACT` strategy up to a certain threshold then falls back to the `postgrestutils.Count.PLANNED` strategy. +That threshold is defined by setting `max-rows` in your PostgREST configuration which will also limit the number of rows fetched per request. #### Filtering http://postgrest.org/en/stable/api.html diff --git a/postgrestutils/__init__.py b/postgrestutils/__init__.py index f08987a..35dc1c4 100644 --- a/postgrestutils/__init__.py +++ b/postgrestutils/__init__.py @@ -15,7 +15,15 @@ default_app_config = "postgrestutils.apps.PostgrestUtilsConfig" REPR_OUTPUT_SIZE = 20 -Count = enum.Enum('Count', (('NONE', None), ('EXACT', 'exact'))) +Count = enum.Enum( + 'Count', + ( + ('NONE', None), + ('EXACT', 'exact'), + ('PLANNED', 'planned'), + ('ESTIMATED', 'estimated') + ) +) DEFAULT_SCHEMA = object() diff --git a/tests/test_postgrestclient.py b/tests/test_postgrestclient.py index 2c93e80..265b046 100644 --- a/tests/test_postgrestclient.py +++ b/tests/test_postgrestclient.py @@ -351,6 +351,197 @@ class TestPgrestClientFilterStrategyExact(TestCase): self.assertEqual(mock.call_count, 5) # should have been called 5 times (fetch len, range, first and all) +@Mocker() +class TestPgrestClientFilterStrategyPlanned(TestCase): + def setUp(self): + super().setUp() + self.data = SUPERHERO_TEST_DATA + + def test_fetch_all_first(self, mock): + # in order to fetch all + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers=DEFAULT_HEADERS, + status_code=200, + reason='OK', + json=self.data + ) + with default_session(count=postgrestutils.Count.PLANNED) as s: + res = s.filter('superhero') + + self.assertIsInstance(res, postgrestutils.JsonResultSet) # should return lazy object + self.assertFalse(mock.called) # no request should have been made yet + + self.assertEqual(list(res), 
self.data) # fetch data + self.assertTrue(mock.called_once) # should have been called once + self.assertEqual(res._result_cache, self.data) # fetched data should be cached + self.assertEqual(res._len_cache, len(self.data)) # len of fetched data should also be cached + self.assertEqual(list(res), self.data) # should utilize cache + self.assertEqual(res[:1], self.data[:1]) # should utilize cache + self.assertEqual(res[:0], self.data[:0]) # should return empty list + self.assertEqual(res[4:2], self.data[4:2]) # should return empty list + self.assertEqual(res[2:], self.data[2:]) # should utilize cache + self.assertEqual(res[0], self.data[0]) # should utilize cache + self.assertTrue(mock.called_once) # should not have been called again + + def test_fetch_len_first(self, mock): + # in order to fetch all + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers=DEFAULT_HEADERS, + status_code=200, + reason='OK', + json=self.data + ) + # in order to fetch first + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={**DEFAULT_HEADERS, **{'Range-Unit': 'items', 'Range': '0-0'}}, + status_code=200, + reason='OK', + headers={'Content-Range': '0-0/*'}, + json=[self.data[0]] + ) + # in order to fetch range since index 2 + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={**DEFAULT_HEADERS, **{'Range-Unit': 'items', 'Range': '2-'}}, + status_code=200, + reason='OK', + headers={'Content-Range': '2-4/*'}, + json=self.data[2:] + ) + # in order to fetch length + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={**DEFAULT_HEADERS, **{'Range-Unit': 'items', 'Range': '0-0', 'Prefer': 'count=planned'}}, + status_code=206, + reason='Partial Content', + headers={'Content-Range': '0-0/5'}, + json=self.data[0] + ) + with default_session(count=postgrestutils.Count.PLANNED) as s: + res = s.filter('superhero') + + self.assertIsInstance(res, postgrestutils.JsonResultSet) # should 
return lazy object + self.assertFalse(mock.called) # no request should have been made yet + + self.assertEqual(len(res), len(self.data)) # should fetch len + self.assertTrue(mock.called_once) # should have been called once + self.assertEqual(res._len_cache, len(self.data)) # len of fetched data should be cached + self.assertEqual(res[:1], self.data[:1]) # should fetch first element as range + self.assertEqual(res[:0], self.data[:0]) # should return empty list + self.assertEqual(res[4:2], self.data[4:2]) # should return empty list + self.assertEqual(res[2:], self.data[2:]) # should fetch range starting at index 2 + self.assertEqual(res[0], self.data[0]) # should fetch first element as range but return dict + self.assertEqual(list(res), self.data) # should fetch all elements + self.assertEqual(res._result_cache, self.data) # should cache all elements + self.assertTrue(mock.called) # should have been called at least once + self.assertEqual(mock.call_count, 5) # should have been called 5 times (fetch len, range, first and all) + + +@Mocker() +class TestPgrestClientFilterStrategyEstimated(TestCase): + def setUp(self): + super().setUp() + self.data = SUPERHERO_TEST_DATA + + def test_fetch_all_first(self, mock): + # in order to fetch all + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers=DEFAULT_HEADERS, + status_code=200, + reason='OK', + json=self.data + ) + with default_session(count=postgrestutils.Count.ESTIMATED) as s: + res = s.filter('superhero') + + self.assertIsInstance(res, postgrestutils.JsonResultSet) # should return lazy object + self.assertFalse(mock.called) # no request should have been made yet + + self.assertEqual(list(res), self.data) # fetch data + self.assertTrue(mock.called_once) # should have been called once + self.assertEqual(res._result_cache, self.data) # fetched data should be cached + self.assertEqual(res._len_cache, len(self.data)) # len of fetched data should also be cached + self.assertEqual(list(res), 
self.data) # should utilize cache + self.assertEqual(res[:1], self.data[:1]) # should utilize cache + self.assertEqual(res[:0], self.data[:0]) # should return empty list + self.assertEqual(res[4:2], self.data[4:2]) # should return empty list + self.assertEqual(res[2:], self.data[2:]) # should utilize cache + self.assertEqual(res[0], self.data[0]) # should utilize cache + self.assertTrue(mock.called_once) # should not have been called again + + def test_fetch_len_first(self, mock): + # in order to fetch all + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers=DEFAULT_HEADERS, + status_code=200, + reason='OK', + json=self.data + ) + # in order to fetch first + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={**DEFAULT_HEADERS, **{'Range-Unit': 'items', 'Range': '0-0'}}, + status_code=200, + reason='OK', + headers={'Content-Range': '0-0/*'}, + json=[self.data[0]] + ) + # in order to fetch range since index 2 + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={**DEFAULT_HEADERS, **{'Range-Unit': 'items', 'Range': '2-'}}, + status_code=200, + reason='OK', + headers={'Content-Range': '2-4/*'}, + json=self.data[2:] + ) + # in order to fetch length + mock.register_uri( + 'GET', + 'http://example.com/superhero', + request_headers={ + **DEFAULT_HEADERS, + **{'Range-Unit': 'items', 'Range': '0-0', 'Prefer': 'count=estimated'} + }, + status_code=206, + reason='Partial Content', + headers={'Content-Range': '0-0/5'}, + json=self.data[0] + ) + with default_session(count=postgrestutils.Count.ESTIMATED) as s: + res = s.filter('superhero') + + self.assertIsInstance(res, postgrestutils.JsonResultSet) # should return lazy object + self.assertFalse(mock.called) # no request should have been made yet + + self.assertEqual(len(res), len(self.data)) # should fetch len + self.assertTrue(mock.called_once) # should have been called once + self.assertEqual(res._len_cache, len(self.data)) # len of 
fetched data should be cached + self.assertEqual(res[:1], self.data[:1]) # should fetch first element as range + self.assertEqual(res[:0], self.data[:0]) # should return empty list + self.assertEqual(res[4:2], self.data[4:2]) # should return empty list + self.assertEqual(res[2:], self.data[2:]) # should fetch range starting at index 2 + self.assertEqual(res[0], self.data[0]) # should fetch first element as range but return dict + self.assertEqual(list(res), self.data) # should fetch all elements + self.assertEqual(res._result_cache, self.data) # should cache all elements + self.assertTrue(mock.called) # should have been called at least once + self.assertEqual(mock.call_count, 5) # should have been called 5 times (fetch len, range, first and all) + + @Mocker() class TestPgrestClientSessionDefaults(TestCase): def setUp(self): -- GitLab