Allow efficient deep pagination to 10K hits
This change sets the default initial pagination depth to 100 hits for
fast initial searches, but lets users explore as deeply as they dare by
progressively increasing the pagination depth, 100 hits at a time, up
to 10K.
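
For a sense of the arithmetic: with the default 25-hit page size, the
depth queried behind the scenes steps up like this (an illustrative
sketch using the PAGINATION_BATCH and MAX_PAGINATION_DEPTH constants
this commit adds in defaults.py):

```python
from math import ceil

PAGINATION_BATCH = 100
MAX_PAGINATION_DEPTH = 10000

for page in (1, 4, 40, 400):
    position = page * 25  # deepest hit the user has requested
    # Look two batches ahead of the position, rounded up to a whole batch.
    depth = PAGINATION_BATCH * 2
    if position >= PAGINATION_BATCH:
        depth += PAGINATION_BATCH * ceil(position / PAGINATION_BATCH)
    print(page, min(depth, MAX_PAGINATION_DEPTH))
# page 1 -> 200, page 4 -> 300, page 40 -> 1200, page 400 -> 10000
```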

Searches will slow slightly as pagination goes deeper, maxing out at
around six seconds per "next page" request as a theoretical user
approaches 10,000 hits under the sea.

This scheme also makes it possible to save a search URL with a
pagination "page" number that will reproduce the search.

Testing

You can test deep pagination locally if you have a complaint index
populated in Docker or in a local Elasticsearch instance.

Docker

If you're running cfgov in Docker, you can clone ccdb5-api into the
develop-apps folder and check out this branch.

Virtual env

If you're running cfgov in a virtual environment:
- Activate your cfgov virtual environment
- Clone the ccdb5-api repo to a folder of your choice
- Check out this branch, and then use `pip install -e .` to inject the branch
into your virtual env.

We can also deploy the code to a DEV server for CA testing.
higs4281 committed Feb 15, 2022
1 parent ea5c7f3 commit e347724
Showing 5 changed files with 96 additions and 29 deletions.
8 changes: 5 additions & 3 deletions complaint_search/defaults.py
@@ -16,10 +16,12 @@
 AGG_PRODUCT_DEFAULT = 30
 AGG_SUBPRODUCT_DEFAULT = 90
 # Other defaults:
-# Pagination depth is the max hits that users can explore page by page.
-# The default result size matches the default for users of our search.
+# Pagination batch is the number of results we paginate at a time.
+# Max pagination depth is the farthest we'll paginate – 100 batches.
+# The default result size matches the front-end default for users.
 # The trend_depth default limits display to 5 items in some Trends contexts.
-PAGINATION_DEPTH_DEFAULT = 1000
+PAGINATION_BATCH = 100
+MAX_PAGINATION_DEPTH = 10000
 RESULT_SIZE_DEFAULT = 25
 RESULT_SIZE_OPTIONS = [10, 50, 100]
 TREND_DEPTH_DEFAULT = 5
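
A quick sanity check of the "100 batches" relationship described in the
new comment (a hypothetical snippet, not part of this commit):

```python
from complaint_search.defaults import MAX_PAGINATION_DEPTH, PAGINATION_BATCH

# The depth ceiling is exactly 100 pagination batches: 100 * 100 == 10000.
assert MAX_PAGINATION_DEPTH == PAGINATION_BATCH * 100
```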
57 changes: 34 additions & 23 deletions complaint_search/es_interface.py
@@ -2,6 +2,7 @@
 import logging
 import os
 from datetime import datetime, timedelta
+from math import ceil

 from elasticsearch import Elasticsearch, RequestsHttpConnection, helpers
 from flags.state import flag_enabled
@@ -10,7 +11,8 @@
 from complaint_search.defaults import (
     CSV_ORDERED_HEADERS,
     EXPORT_FORMATS,
-    PAGINATION_DEPTH_DEFAULT,
+    MAX_PAGINATION_DEPTH,
+    PAGINATION_BATCH,
     PARAMS,
 )
 from complaint_search.es_builders import (
@@ -61,13 +63,27 @@ def build_trend_meta(response):
     return meta


+# PAGINATION_BATCH = 100
+# MAX_PAGINATION_DEPTH = 10000
+def get_pagination_query_size(page, user_batch_size):
+    batch_ahead = PAGINATION_BATCH * 2
+    batch_point = page * user_batch_size
+    if batch_point < PAGINATION_BATCH:
+        return batch_ahead
+    multiplier = ceil(batch_point / PAGINATION_BATCH)
+    query_size = batch_ahead + (PAGINATION_BATCH * multiplier)
+    if query_size <= MAX_PAGINATION_DEPTH:
+        return query_size
+    else:
+        return MAX_PAGINATION_DEPTH
+
+
 def get_break_points(hits, size):
-    """Return a dict of 'search-after' values for pagination."""
-    end_page = int(PAGINATION_DEPTH_DEFAULT / size)
+    """Return a dict of upcoming 'search-after' values for pagination."""
+    end_page = int(MAX_PAGINATION_DEPTH / size)
     break_points = {}
     if size >= len(hits):
         return break_points
+    # we don't need a break point for page 1; start with page 2
     page = 2
     break_points[page] = hits[size - 1].get("sort")
     next_batch = hits[size:]
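
A quick way to exercise the new helper (assuming an environment where
complaint_search is importable, as in the tests added below):

```python
from complaint_search.es_interface import get_pagination_query_size

# The query reaches two 100-hit batches past the user's current position
# (rounded up to a whole batch) and is capped at the 10K depth limit.
for page in (1, 2, 6, 40, 400):
    print(page, get_pagination_query_size(page, 25))
# 1 -> 200, 2 -> 200, 6 -> 400, 40 -> 1200, 400 -> 10000
```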
@@ -193,16 +209,6 @@ def _get_meta():
"field": "date_indexed",
"format": "yyyy-MM-dd'T'12:00:00-05:00"
}
},
"max_narratives": {
"filter": {"term": {"has_narrative": "true"}},
"aggs": {
"max_date": {
"max": {
"field": ":updated_at",
}
}
}
}
}
}
@@ -269,13 +275,12 @@ def search(agg_exclude=None, **kwargs):
     - Update params with request details.
     - Add a formatted 'search_after' param if pagination is requested.
     - Build a search body based on params
-    - Add filter and aggregation sections to the search body, based on params.
+    - Add param-based post_filter and aggregation sections to the search body.
     - Add a track_total_hits directive to get accurate hit counts (new in 2021)
     - Assemble pagination break points if needed.
-    Then responses are finalized based on whether the results are to be viewed
+    The response is finalized based on whether the results are to be viewed
     in a browser or exported as CSV or JSON.
-    Viewable results are paginated in most cases.
     Exportable results are produced with "scroll" Elasticsearch searches,
     and are never paginated.
     """
@@ -307,24 +312,30 @@
     res = _get_es().search(index=_COMPLAINT_ES_INDEX, body=body)
     hit_total = res['hits']['total']['value']
     break_points = {}
+    # page = 1
     if res['hits']['hits']:
-        if hit_total and hit_total > body["size"]:
+        user_batch_size = body["size"]
+        if hit_total and hit_total > user_batch_size:
             # We have more than one page of results and need pagination
             pagination_body = copy.deepcopy(body)
-            pagination_body["size"] = PAGINATION_DEPTH_DEFAULT
+            # cleaner to get page from frontend, but 'frm' works for now
+            page = params.get("frm", user_batch_size) / user_batch_size
+            pagination_body["size"] = get_pagination_query_size(
+                page, user_batch_size
+            )
             if "search_after" in pagination_body:
                 del pagination_body["search_after"]
             log.info(
-                'Harvesting pagination dict using %s/%s/_search with %s',
+                'Harvesting break points using %s/%s/_search with %s',
                 _ES_URL, _COMPLAINT_ES_INDEX, pagination_body
             )
             pagination_res = _get_es().search(
                 index=_COMPLAINT_ES_INDEX,
                 body=pagination_body
             )
             break_points = get_break_points(
-                pagination_res['hits']['hits'], body["size"])
+                pagination_res['hits']['hits'],
+                user_batch_size
+            )
     res["_meta"] = _get_meta()
     res["_meta"]["break_points"] = break_points

2 changes: 1 addition & 1 deletion complaint_search/tests/test_es_interface.py
@@ -263,7 +263,7 @@ def test__pagination_generated(self, mock_count, mock_meta, mock_es):
"hits": {
"total": {"value": 10000},
"hits": fake_hits}}
response = search(size=2, search_after="1620752400004_4367497")
response = search(size=2, search_after="1620752400004_4367497", page=2)
self.assertEqual(len(response.get("_meta").get("break_points")), 2)

@mock.patch("complaint_search.es_interface._get_now")
54 changes: 54 additions & 0 deletions complaint_search/tests/test_get_pagination_query_size.py
@@ -0,0 +1,54 @@
+import unittest
+
+from complaint_search.defaults import MAX_PAGINATION_DEPTH, PAGINATION_BATCH
+from complaint_search.es_interface import get_pagination_query_size
+
+
+class TestPaginationSize(unittest.TestCase):
+
+    def setUp(self):
+        self.user_sizes = [10, 25, 50, 100]
+
+    def test_get_pagination_query_size_page_1(self):
+        page = 1
+        for user_size in self.user_sizes[:3]:
+            self.assertEqual(
+                get_pagination_query_size(page, user_size),
+                PAGINATION_BATCH * 2
+            )
+        self.assertEqual(
+            get_pagination_query_size(page, 400),
+            600
+        )
+
+    def test_get_pagination_query_size_page_2(self):
+        page = 2
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 2
+        )
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_with_remainder(self):
+        page = 6
+        self.assertEqual(
+            get_pagination_query_size(page, 25),
+            PAGINATION_BATCH * 4
+        )
+
+    def test_get_pagination_query_size_equals_max(self):
+        page = 100
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
+
+    def test_get_pagination_query_size_exceeds_max(self):
+        page = 101
+        self.assertEqual(
+            get_pagination_query_size(page, 100),
+            MAX_PAGINATION_DEPTH
+        )
4 changes: 2 additions & 2 deletions tox.ini
@@ -15,7 +15,7 @@ setenv=
 commands=
     coverage erase
     coverage run manage.py test {posargs}
-    coverage report
+    coverage report -m
     coverage html

 [testenv:lint]
@@ -50,4 +50,4 @@ sections=FUTURE,STDLIB,DJANGO,THIRDPARTY,FIRSTPARTY,LOCALFOLDER

 [travis]
 python=
-    3.6: py36-dj22, lint
+    3.8: py38-dj22, lint
