diff --git a/complaint_search/defaults.py b/complaint_search/defaults.py
index e30cf3f..e907994 100644
--- a/complaint_search/defaults.py
+++ b/complaint_search/defaults.py
@@ -16,10 +16,12 @@ AGG_PRODUCT_DEFAULT = 30
 AGG_SUBPRODUCT_DEFAULT = 90
 
 # Other defaults:
-# Pagination depth is the max hits that users can explore page by page.
-# The default result size matches the default for users of our search.
+# Pagination batch is the number of results we paginate at a time.
+# Max pagination depth is the farthest we'll paginate – 100 batches.
+# The default result size matches the front-end default for users.
 # The trend_depth default limits display to 5 items in some Trends contexts.
-PAGINATION_DEPTH_DEFAULT = 1000
+PAGINATION_BATCH = 100
+MAX_PAGINATION_DEPTH = 10000
 RESULT_SIZE_DEFAULT = 25
 RESULT_SIZE_OPTIONS = [10, 50, 100]
 TREND_DEPTH_DEFAULT = 5
diff --git a/complaint_search/es_interface.py b/complaint_search/es_interface.py
index 87912b4..34c9acb 100644
--- a/complaint_search/es_interface.py
+++ b/complaint_search/es_interface.py
@@ -2,6 +2,7 @@ import logging
 import os
 from datetime import datetime, timedelta
+from math import ceil
 
 from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
 from flags.state import flag_enabled
@@ -10,7 +11,8 @@
 from complaint_search.defaults import (
     CSV_ORDERED_HEADERS,
     EXPORT_FORMATS,
-    PAGINATION_DEPTH_DEFAULT,
+    MAX_PAGINATION_DEPTH,
+    PAGINATION_BATCH,
     PARAMS,
 )
 from complaint_search.es_builders import (
@@ -61,13 +63,27 @@ def build_trend_meta(response):
     return meta
 
 
+# PAGINATION_BATCH = 100
+# MAX_PAGINATION_DEPTH = 10000
+def get_pagination_query_size(page, user_batch_size):
+    batch_ahead = PAGINATION_BATCH * 2
+    batch_point = page * user_batch_size
+    if batch_point < PAGINATION_BATCH:
+        return batch_ahead
+    multiplier = ceil(batch_point / PAGINATION_BATCH)
+    query_size = batch_ahead + (PAGINATION_BATCH * multiplier)
+    if query_size <= MAX_PAGINATION_DEPTH:
+        return query_size
+    else:
+        return MAX_PAGINATION_DEPTH
+
+
 def get_break_points(hits, size):
-    """Return a dict of 'search-after' values for pagination."""
-    end_page = int(PAGINATION_DEPTH_DEFAULT / size)
+    """Return a dict of upcoming 'search-after' values for pagination."""
+    end_page = int(MAX_PAGINATION_DEPTH / size)
     break_points = {}
     if size >= len(hits):
         return break_points
-    # we don't need a break point for page 1; start with page 2
     page = 2
     break_points[page] = hits[size - 1].get("sort")
     next_batch = hits[size:]
@@ -193,16 +209,6 @@ def _get_meta():
                     "field": "date_indexed",
                     "format": "yyyy-MM-dd'T'12:00:00-05:00"
                 }
-            },
-            "max_narratives": {
-                "filter": {"term": {"has_narrative": "true"}},
-                "aggs": {
-                    "max_date": {
-                        "max": {
-                            "field": ":updated_at",
-                        }
-                    }
-                }
             }
         }
     }
@@ -269,13 +275,12 @@ def search(agg_exclude=None, **kwargs):
     - Update params with request details.
     - Add a formatted 'search_after' param if pagination is requested.
     - Build a search body based on params
-    - Add filter and aggregation sections to the search body, based on params.
+    - Add param-based post_filter and aggregation sections to the search body.
     - Add a track_total_hits directive to get accurate hit counts (new in 2021)
+    - Assemble pagination break points if needed.
 
-    Then responses are finalized based on whether the results are to be viewed
+    The response is finalized based on whether the results are to be viewed
     in a browser or exported as CSV or JSON.
-    - Viewable results are paginated in most cases.
-
     Exportable results are produced with "scroll" Elasticsearch searches,
     and are never paginated.
     """
@@ -307,16 +312,20 @@ def search(agg_exclude=None, **kwargs):
     res = _get_es().search(index=_COMPLAINT_ES_INDEX, body=body)
     hit_total = res['hits']['total']['value']
     break_points = {}
-    # page = 1
     if res['hits']['hits']:
-        if hit_total and hit_total > body["size"]:
+        user_batch_size = body["size"]
+        if hit_total and hit_total > user_batch_size:
             # We have more than one page of results and need pagination
             pagination_body = copy.deepcopy(body)
-            pagination_body["size"] = PAGINATION_DEPTH_DEFAULT
+            # cleaner to get page from frontend, but 'frm' works for now
+            page = params.get("frm", user_batch_size) / user_batch_size
+            pagination_body["size"] = get_pagination_query_size(
+                page, user_batch_size
+            )
             if "search_after" in pagination_body:
                 del pagination_body["search_after"]
             log.info(
-                'Harvesting pagination dict using %s/%s/_search with %s',
+                'Harvesting break points using %s/%s/_search with %s',
                 _ES_URL, _COMPLAINT_ES_INDEX, pagination_body
             )
             pagination_res = _get_es().search(
@@ -324,7 +333,9 @@ def search(agg_exclude=None, **kwargs):
                 index=_COMPLAINT_ES_INDEX,
                 body=pagination_body
             )
             break_points = get_break_points(
-                pagination_res['hits']['hits'], body["size"])
+                pagination_res['hits']['hits'],
+                user_batch_size
+            )
     res["_meta"] = _get_meta()
     res["_meta"]["break_points"] = break_points
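The new sizing math is easiest to check with concrete numbers. Below is a minimal standalone sketch that mirrors get_pagination_query_size under the defaults introduced above (PAGINATION_BATCH = 100, MAX_PAGINATION_DEPTH = 10000; the 10,000 cap also matches Elasticsearch's default index.max_result_window, which is presumably why it is the ceiling). The printed values agree with the new unit tests further down.

from math import ceil

PAGINATION_BATCH = 100
MAX_PAGINATION_DEPTH = 10000

def pagination_query_size(page, user_batch_size):
    # Mirror of the new helper: always fetch two batches of lookahead, plus
    # enough whole batches to cover the user's current position, capped at
    # the maximum pagination depth.
    batch_ahead = PAGINATION_BATCH * 2
    batch_point = page * user_batch_size
    if batch_point < PAGINATION_BATCH:
        return batch_ahead
    multiplier = ceil(batch_point / PAGINATION_BATCH)
    return min(batch_ahead + PAGINATION_BATCH * multiplier, MAX_PAGINATION_DEPTH)

for page, size in [(1, 25), (2, 100), (6, 25), (100, 100), (101, 100)]:
    print(page, size, pagination_query_size(page, size))
# 1 25 200 / 2 100 400 / 6 25 400 / 100 100 10000 / 101 100 10000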
diff --git a/complaint_search/tests/test_es_interface.py b/complaint_search/tests/test_es_interface.py
index ba690ad..9a22107 100644
--- a/complaint_search/tests/test_es_interface.py
+++ b/complaint_search/tests/test_es_interface.py
@@ -263,7 +263,7 @@ def test__pagination_generated(self, mock_count, mock_meta, mock_es):
             "hits": {
                 "total": {"value": 10000},
                 "hits": fake_hits}}
-        response = search(size=2, search_after="1620752400004_4367497")
+        response = search(size=2, search_after="1620752400004_4367497", page=2)
         self.assertEqual(len(response.get("_meta").get("break_points")), 2)
 
     @mock.patch("complaint_search.es_interface._get_now")
diff --git a/complaint_search/tests/test_get_pagination_query_size.py b/complaint_search/tests/test_get_pagination_query_size.py
new file mode 100644
index 0000000..276ac82
--- /dev/null
+++ b/complaint_search/tests/test_get_pagination_query_size.py
@@ -0,0 +1,54 @@
+import unittest
+
+from complaint_search.defaults import MAX_PAGINATION_DEPTH, PAGINATION_BATCH
+from complaint_search.es_interface import get_pagination_query_size
+
+
+class TestPaginationSize(unittest.TestCase):
+
+    def setUp(self):
+        self.user_sizes = [10, 25, 50, 100]
+
+    def test_get_pagination_query_size_page_1(self):
+        page = 1
+        for user_size in self.user_sizes[:3]:
+            self.assertEqual(
+                get_pagination_query_size(page, user_size),
+                PAGINATION_BATCH * 2
+            )
+        self.assertEqual(
+            get_pagination_query_size(page, 400),
+            600
+        )
+
+    def test_get_pagination_query_size_page_2(self):
+        page = 2
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 2
+        )
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_with_remainder(self):
+        page = 6
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_equals_max(self):
+        page = 100
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
+
+    def test_get_pagination_query_size_exceeds_max(self):
+        page = 101
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
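For context on how these pieces fit together at request time, the sketch below is illustrative only, built from names visible in this diff ('frm', 'size', 'search_after', and the _meta.break_points dict). The underscore-joined search_after string is an assumption inferred from the fixture value "1620752400004_4367497" above, and fetch_next_page_params is a hypothetical client-side helper, not part of the API.

def fetch_next_page_params(response, current_page, size):
    # _meta.break_points maps an upcoming page number to the Elasticsearch
    # 'sort' values of the hit just before that page (see get_break_points).
    break_points = response["_meta"]["break_points"]
    target = current_page + 1
    sort_values = break_points.get(target)
    if sort_values is None:
        return None  # no harvested break point this far ahead
    return {
        "frm": target * size,  # search() recovers page as frm / size
        "size": size,
        # Assumed formatting: the test fixture joins sort values with "_".
        "search_after": "_".join(str(v) for v in sort_values),
    }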
diff --git a/tox.ini b/tox.ini
index 4f66d88..df155ba 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,7 +15,7 @@ setenv=
 commands=
     coverage erase
     coverage run manage.py test {posargs}
-    coverage report
+    coverage report -m
     coverage html
 
 [testenv:lint]
@@ -50,4 +50,4 @@ sections=FUTURE,STDLIB,DJANGO,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
 
 [travis]
 python=
-    3.6: py36-dj22, lint
+    3.8: py38-dj22, lint
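tox still routes the suite through manage.py test, but the new module can also be exercised on its own with the standard unittest loader. A minimal sketch (module path taken from this diff; "settings" is a hypothetical settings path, needed because complaint_search imports Django-dependent code):

import os
import unittest

import django

# Hypothetical settings module; the real one depends on the host project.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
django.setup()

# Load and run only the new pagination-size tests.
suite = unittest.defaultTestLoader.loadTestsFromName(
    "complaint_search.tests.test_get_pagination_query_size"
)
unittest.TextTestRunner(verbosity=2).run(suite)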