Merge pull request #187 from cfpb/batch-pagination
Allow efficient deep pagination to 10K hits
higs4281 authored Feb 17, 2022
2 parents ea5c7f3 + e347724 commit 2cc3f0d
Showing 5 changed files with 96 additions and 29 deletions.
8 changes: 5 additions & 3 deletions complaint_search/defaults.py
@@ -16,10 +16,12 @@
 AGG_PRODUCT_DEFAULT = 30
 AGG_SUBPRODUCT_DEFAULT = 90
 # Other defaults:
-# Pagination depth is the max hits that users can explore page by page.
-# The default result size matches the default for users of our search.
+# Pagination batch is the number of results we paginate at a time.
+# Max pagination depth is the farthest we'll paginate – 100 batches.
+# The default result size matches the front-end default for users.
 # The trend_depth default limits display to 5 items in some Trends contexts.
-PAGINATION_DEPTH_DEFAULT = 1000
+PAGINATION_BATCH = 100
+MAX_PAGINATION_DEPTH = 10000
 RESULT_SIZE_DEFAULT = 25
 RESULT_SIZE_OPTIONS = [10, 50, 100]
 TREND_DEPTH_DEFAULT = 5
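For orientation, a quick sketch (not part of the commit) of how the new constants relate: break points are now harvested in 100-hit batches, and the 10,000-hit ceiling works out to 100 batches, or 400 pages at the default result size of 25.

# Editor's sketch, not part of the commit: the arithmetic behind the constants.
from complaint_search.defaults import (
    MAX_PAGINATION_DEPTH,
    PAGINATION_BATCH,
    RESULT_SIZE_DEFAULT,
)

assert MAX_PAGINATION_DEPTH // PAGINATION_BATCH == 100     # 100 batches
assert MAX_PAGINATION_DEPTH // RESULT_SIZE_DEFAULT == 400  # 400 pages of 25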
57 changes: 34 additions & 23 deletions complaint_search/es_interface.py
@@ -2,6 +2,7 @@
 import logging
 import os
 from datetime import datetime, timedelta
+from math import ceil
 
 from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
 from flags.state import flag_enabled
@@ -10,7 +11,8 @@
 from complaint_search.defaults import (
     CSV_ORDERED_HEADERS,
     EXPORT_FORMATS,
-    PAGINATION_DEPTH_DEFAULT,
+    MAX_PAGINATION_DEPTH,
+    PAGINATION_BATCH,
     PARAMS,
 )
 from complaint_search.es_builders import (
@@ -61,13 +63,27 @@ def build_trend_meta(response):
     return meta
 
 
+# PAGINATION_BATCH = 100
+# MAX_PAGINATION_DEPTH = 10000
+def get_pagination_query_size(page, user_batch_size):
+    batch_ahead = PAGINATION_BATCH * 2
+    batch_point = page * user_batch_size
+    if batch_point < PAGINATION_BATCH:
+        return batch_ahead
+    multiplier = ceil(batch_point / PAGINATION_BATCH)
+    query_size = batch_ahead + (PAGINATION_BATCH * multiplier)
+    if query_size <= MAX_PAGINATION_DEPTH:
+        return query_size
+    else:
+        return MAX_PAGINATION_DEPTH
+
+
 def get_break_points(hits, size):
-    """Return a dict of 'search-after' values for pagination."""
-    end_page = int(PAGINATION_DEPTH_DEFAULT / size)
+    """Return a dict of upcoming 'search-after' values for pagination."""
+    end_page = int(MAX_PAGINATION_DEPTH / size)
     break_points = {}
     if size >= len(hits):
         return break_points
+    # we don't need a break point for page 1; start with page 2
     page = 2
     break_points[page] = hits[size - 1].get("sort")
     next_batch = hits[size:]
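To make the new sizing concrete, a few sample calls (an editor's illustration; the values agree with the tests added below). The function reserves a two-batch lookahead of 200 hits, rounds the caller's position up to the next 100-hit batch, and never requests more than 10,000 hits:

# Illustration only, not part of the commit; values match the new tests below.
get_pagination_query_size(1, 25)     # 200: page 1 fits inside the 2-batch lookahead
get_pagination_query_size(2, 100)    # 400: 200 + ceil(200 / 100) * 100
get_pagination_query_size(6, 25)     # 400: 200 + ceil(150 / 100) * 100
get_pagination_query_size(100, 100)  # 10000: 10200 clamped to MAX_PAGINATION_DEPTH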
@@ -193,16 +209,6 @@ def _get_meta():
                 "field": "date_indexed",
                 "format": "yyyy-MM-dd'T'12:00:00-05:00"
             }
-        },
-        "max_narratives": {
-            "filter": {"term": {"has_narrative": "true"}},
-            "aggs": {
-                "max_date": {
-                    "max": {
-                        "field": ":updated_at",
-                    }
-                }
-            }
         }
     }
 }
@@ -269,13 +275,12 @@ def search(agg_exclude=None, **kwargs):
     - Update params with request details.
     - Add a formatted 'search_after' param if pagination is requested.
     - Build a search body based on params
-    - Add filter and aggregation sections to the search body, based on params.
+    - Add param-based post_filter and aggregation sections to the search body.
     - Add a track_total_hits directive to get accurate hit counts (new in 2021)
     - Assemble pagination break points if needed.
-    Then responses are finalized based on whether the results are to be viewed
+    The response is finalized based on whether the results are to be viewed
     in a browser or exported as CSV or JSON.
-    Viewable results are paginated in most cases.
     Exportable results are produced with "scroll" Elasticsearch searches,
     and are never paginated.
     """
@@ -307,24 +312,30 @@
         res = _get_es().search(index=_COMPLAINT_ES_INDEX, body=body)
         hit_total = res['hits']['total']['value']
         break_points = {}
+        # page = 1
         if res['hits']['hits']:
-            if hit_total and hit_total > body["size"]:
+            user_batch_size = body["size"]
+            if hit_total and hit_total > user_batch_size:
                 # We have more than one page of results and need pagination
                 pagination_body = copy.deepcopy(body)
-                pagination_body["size"] = PAGINATION_DEPTH_DEFAULT
+                # cleaner to get page from frontend, but 'frm' works for now
+                page = params.get("frm", user_batch_size) / user_batch_size
+                pagination_body["size"] = get_pagination_query_size(
+                    page, user_batch_size
+                )
                 if "search_after" in pagination_body:
                     del pagination_body["search_after"]
                 log.info(
-                    'Harvesting pagination dict using %s/%s/_search with %s',
+                    'Harvesting break points using %s/%s/_search with %s',
                     _ES_URL, _COMPLAINT_ES_INDEX, pagination_body
                 )
                 pagination_res = _get_es().search(
                     index=_COMPLAINT_ES_INDEX,
                     body=pagination_body
                 )
                 break_points = get_break_points(
-                    pagination_res['hits']['hits'], body["size"])
+                    pagination_res['hits']['hits'],
+                    user_batch_size
+                )
         res["_meta"] = _get_meta()
         res["_meta"]["break_points"] = break_points
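Putting the pieces together, a minimal sketch of the flow this hunk enables (an editor's illustration; the frm-to-page mapping is read off the code above, and the search_after string form follows the test below):

# Sketch only, not part of the commit. Assumes a client that tracks its
# position via 'frm' and echoes back a break point from the prior response.
res = search(size=25)                      # page 1
points = res["_meta"]["break_points"]      # e.g. {2: <sort values>, 3: ...}

# With frm=50 and size=25, search() derives page = 50 / 25 = 2, so the
# break-point harvest is sized by get_pagination_query_size(2, 25) -> 200.
page_2 = search(size=25, frm=50, search_after="1620752400004_4367497")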
2 changes: 1 addition & 1 deletion complaint_search/tests/test_es_interface.py
@@ -263,7 +263,7 @@ def test__pagination_generated(self, mock_count, mock_meta, mock_es):
             "hits": {
                 "total": {"value": 10000},
                 "hits": fake_hits}}
-        response = search(size=2, search_after="1620752400004_4367497")
+        response = search(size=2, search_after="1620752400004_4367497", page=2)
         self.assertEqual(len(response.get("_meta").get("break_points")), 2)
 
     @mock.patch("complaint_search.es_interface._get_now")
54 changes: 54 additions & 0 deletions complaint_search/tests/test_get_pagination_query_size.py
@@ -0,0 +1,54 @@
+import unittest
+
+from complaint_search.defaults import MAX_PAGINATION_DEPTH, PAGINATION_BATCH
+from complaint_search.es_interface import get_pagination_query_size
+
+
+class TestPaginationSize(unittest.TestCase):
+
+    def setUp(self):
+        self.user_sizes = [10, 25, 50, 100]
+
+    def test_get_pagination_query_size_page_1(self):
+        page = 1
+        for user_size in self.user_sizes[:3]:
+            self.assertEqual(
+                get_pagination_query_size(page, user_size),
+                PAGINATION_BATCH * 2
+            )
+        self.assertEqual(
+            get_pagination_query_size(page, 400),
+            600
+        )
+
+    def test_get_pagination_query_size_page_2(self):
+        page = 2
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 2
+        )
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_with_remainder(self):
+        page = 6
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_equals_max(self):
+        page = 100
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
+
+    def test_get_pagination_query_size_exceeds_max(self):
+        page = 101
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
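The two cap tests are the point of the exercise: Elasticsearch rejects requests that reach past index.max_result_window (10,000 by default), so the harvest size is clamped rather than allowed to grow with the page number. Restated as assertions:

# Editor's restatement of the boundary behavior pinned down above.
assert get_pagination_query_size(100, 100) == 10000  # 10200 clamped to the max
assert get_pagination_query_size(101, 100) == 10000  # deeper pages stay clamped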
4 changes: 2 additions & 2 deletions tox.ini
@@ -15,7 +15,7 @@ setenv=
 commands=
     coverage erase
     coverage run manage.py test {posargs}
-    coverage report
+    coverage report -m
     coverage html
 
 [testenv:lint]
@@ -50,4 +50,4 @@ sections=FUTURE,STDLIB,DJANGO,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
 
 [travis]
 python=
-    3.6: py36-dj22, lint
+    3.8: py38-dj22, lint