Skip to content

Commit

Permalink
Merge pull request #3667 from freelawproject/3658-fix-bad-escape-hl-m…
Browse files Browse the repository at this point in the history
…erger

Introduced a matched_fields parameter for combining highlights in FVH fields
  • Loading branch information
mlissner authored Jan 26, 2024
2 parents afcf3cd + b8cd6ab commit b087596
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 67 deletions.
45 changes: 35 additions & 10 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,7 @@ def build_has_child_query(
fields_to_exclude.append(field)
highlight_options["fields"][field] = {
"type": settings.ES_HIGHLIGHTER,
"matched_fields": [field, f"{field}.exact"],
"fragment_size": fragment_size,
"no_match_size": no_match_size,
"number_of_fragments": number_of_fragments,
Expand Down Expand Up @@ -1131,7 +1132,7 @@ def add_es_highlighting(
fields_to_exclude = []
highlighting_fields = []
hl_tag = ALERTS_HL_TAG if alerts else SEARCH_HL_TAG

matched_fields = False
match cd["type"]:
case SEARCH_TYPES.ORAL_ARGUMENT:
highlighting_fields = (
Expand All @@ -1144,18 +1145,30 @@ def add_es_highlighting(
highlighting_fields = SOLR_PEOPLE_ES_HL_FIELDS
case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS:
highlighting_fields = SEARCH_RECAP_HL_FIELDS
matched_fields = True
case SEARCH_TYPES.OPINION:
highlighting_fields = SEARCH_OPINION_HL_FIELDS
matched_fields = True

search_query = search_query.source(excludes=fields_to_exclude)
for field in highlighting_fields:
search_query = search_query.highlight(
field,
type="plain",
number_of_fragments=0,
pre_tags=[f"<{hl_tag}>"],
post_tags=[f"</{hl_tag}>"],
)
if matched_fields:
search_query = search_query.highlight(
field,
type=settings.ES_HIGHLIGHTER,
matched_fields=[field, f"{field}.exact"],
number_of_fragments=0,
pre_tags=[f"<{hl_tag}>"],
post_tags=[f"</{hl_tag}>"],
)
else:
search_query = search_query.highlight(
field,
type="plain",
number_of_fragments=0,
pre_tags=[f"<{hl_tag}>"],
post_tags=[f"</{hl_tag}>"],
)

return search_query

Expand All @@ -1174,7 +1187,7 @@ def replace_highlight(

for word in unique_hl_strings:
# Create a pattern to match the word as a whole word.
pattern = rf"(?<!\w){regex.escape(word)}(?!\w)"
pattern = rf"(?<!\w){word}(?!\w)"

# Replace with the specified tag
replacement = f"<{tag}>{word}</{tag}>"
Expand Down Expand Up @@ -1210,7 +1223,10 @@ def select_unique_hl(


def merge_highlights_into_result(
highlights: dict[str, Any], result: AttrDict | dict[str, Any], tag: str
highlights: dict[str, Any],
result: AttrDict | dict[str, Any],
tag: str,
search_type: str | None = None,
) -> None:
"""Merges the highlight terms into the search result.
This function processes highlighted fields in the `highlights` attribute
Expand All @@ -1221,6 +1237,7 @@ def merge_highlights_into_result(
their highlighted terms.
:param result: The AttrDict object containing search results.
:param tag: The HTML tag used to mark highlighted terms.
:param search_type: The search type being performed.
:return: None, the function updates the results in place.
"""

Expand All @@ -1229,6 +1246,12 @@ def merge_highlights_into_result(
field,
highlight_list,
) in highlights.items():
if search_type in [SEARCH_TYPES.RECAP, SEARCH_TYPES.OPINION]:
# For RECAP and Opinions Search that use FVH, highlighted results
# are already combined. Simply assign them to the _source field.
result[field] = highlight_list
continue

# If a query highlights fields, the "field.exact", "field" or
# both versions are available. Highlighted terms in each
# version can differ, so the best thing to do is combine
Expand Down Expand Up @@ -1338,6 +1361,7 @@ def set_results_highlights(results: Page | Response, search_type: str) -> None:
highlights,
result,
SEARCH_HL_TAG,
search_type,
)

# Merge child document highlights
Expand All @@ -1350,6 +1374,7 @@ def set_results_highlights(results: Page | Response, search_type: str) -> None:
highlights,
child_doc["_source"],
SEARCH_HL_TAG,
search_type,
)


Expand Down
17 changes: 0 additions & 17 deletions cl/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,32 +145,21 @@
]
SEARCH_RECAP_HL_FIELDS = [
"assignedTo",
"assignedTo.exact",
"caseName",
"caseName.exact",
"cause",
"cause.exact",
"court_citation_string",
"docketNumber",
"docketNumber.exact",
"juryDemand",
"juryDemand.exact",
"referredTo",
"referredTo.exact",
"suitNature",
"suitNature.exact",
]

SEARCH_OPINION_HL_FIELDS = [
"caseName",
"caseName.exact",
"citation",
"citation.exact",
"court_citation_string",
"docketNumber",
"docketNumber.exact",
"suitNature",
"suitNature.exact",
]


Expand All @@ -181,17 +170,11 @@
# or provide a different integer to limit the snippet length.
SEARCH_RECAP_CHILD_HL_FIELDS = {
"short_description": 0,
"short_description.exact": 0,
"description": 0,
"description.exact": 0,
"document_type": 0,
"document_type.exact": 0,
"plain_text": 100,
"plain_text.exact": 100,
}
SEARCH_OPINION_CHILD_HL_FIELDS = {
"text": 100,
"text.exact": 100,
}

MULTI_VALUE_HL_FIELDS = ["citation"]
Expand Down
20 changes: 7 additions & 13 deletions cl/search/tests/tests_es_opinion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1094,11 +1094,10 @@ async def test_results_highlights(self) -> None:
params = {"q": "'Howard v. Honda'"}

r = await self._test_article_count(params, 1, "highlights case name")
self.assertIn("<mark>Howard</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>Howard</mark>"), 1)

self.assertIn("<mark>Honda</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>Honda</mark>"), 1)
self.assertIn("<mark>Howard v. Honda</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>Howard v. Honda</mark>"), 1
)

# Highlight Citation. Multiple HL fields are properly merged.
params = {"q": "citation:(22 AL) OR citation:(33 state)"}
Expand All @@ -1110,23 +1109,18 @@ async def test_results_highlights(self) -> None:

params = {"q": '"22 AL 339"'}
r = await self._test_article_count(params, 1, "highlights case name")
self.assertIn("<mark>22</mark>", r.content.decode())
self.assertIn("<mark>AL</mark>", r.content.decode())
self.assertIn("<mark>339</mark>", r.content.decode())
self.assertIn("<mark>22 AL 339</mark>", r.content.decode())

params = {"q": '22 AL OR "Yeates 1"'}
r = await self._test_article_count(params, 1, "highlights case name")
self.assertIn("<mark>22</mark>", r.content.decode())
self.assertIn("<mark>AL</mark>", r.content.decode())
self.assertIn("<mark>Yeates</mark>", r.content.decode())
self.assertIn("<mark>1</mark>", r.content.decode())
self.assertIn("<mark>Yeates 1</mark>", r.content.decode())

# Highlight docketNumber.
params = {"q": 'docketNumber:"docket number 2"'}
r = await self._test_article_count(params, 1, "highlights case name")
self.assertIn("<mark>docket</mark>", r.content.decode())
self.assertIn("<mark>number</mark>", r.content.decode())
self.assertIn("<mark>2</mark>", r.content.decode())
self.assertIn("<mark>docket number 2</mark>", r.content.decode())

# Highlight suitNature.
params = {"q": '"copyright"'}
Expand Down
55 changes: 28 additions & 27 deletions cl/search/tests/tests_es_recap.py
Original file line number Diff line number Diff line change
Expand Up @@ -1450,10 +1450,10 @@ async def test_results_highlights(self) -> None:
0, r.content.decode(), 1, "highlights case name"
)

self.assertIn("<mark>SUBPOENAS</mark>", r.content.decode())
self.assertIn("<mark>SERVED</mark>", r.content.decode())
self.assertIn("<mark>OFF</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>OFF</mark>"), 1)
self.assertIn("<mark>SUBPOENAS SERVED OFF</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>SUBPOENAS SERVED OFF</mark>"), 1
)

# Confirm we can limit the length of the plain_text snippet using the
# NO_MATCH_HL_SIZE setting.
Expand All @@ -1478,8 +1478,10 @@ async def test_results_highlights(self) -> None:
0, r.content.decode(), 2, "highlights case name"
)

self.assertIn("<mark>Thalassa</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>Thalassa</mark>"), 1)
self.assertIn("<mark>Thalassa Miller</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>Thalassa Miller</mark>"), 1
)

# Highlight referred_to.
params = {"type": SEARCH_TYPES.RECAP, "q": "Persephone Sinclair"}
Expand All @@ -1490,9 +1492,9 @@ async def test_results_highlights(self) -> None:
0, r.content.decode(), 2, "highlights case name"
)

self.assertIn("<mark>Persephone</mark>", r.content.decode())
self.assertIn("<mark>Persephone Sinclair</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>Persephone</mark>"), 1
r.content.decode().count("<mark>Persephone Sinclair</mark>"), 1
)

# Highlight docketNumber.
Expand Down Expand Up @@ -1520,9 +1522,9 @@ async def test_results_highlights(self) -> None:
0, r.content.decode(), 1, "highlights description"
)

self.assertIn("<mark>Discharging</mark>", r.content.decode())
self.assertIn("<mark>Discharging Debtor</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>Discharging</mark>"), 1
r.content.decode().count("<mark>Discharging Debtor</mark>"), 1
)
# Highlight suitNature and text.
params = {"type": SEARCH_TYPES.RECAP, "q": "Lorem 440"}
Expand All @@ -1543,8 +1545,9 @@ async def test_results_highlights(self) -> None:
self._count_child_documents(
0, r.content.decode(), 1, "highlights plain_text"
)
self.assertEqual(r.content.decode().count("<mark>Maecenas</mark>"), 1)
self.assertEqual(r.content.decode().count("<mark>nunc</mark>"), 1)
self.assertEqual(
r.content.decode().count("<mark>Maecenas nunc</mark>"), 1
)
self.assertEqual(r.content.decode().count("<mark>justo</mark>"), 1)

# Highlight plain_text snippet.
Expand All @@ -1566,9 +1569,8 @@ async def test_results_highlights(self) -> None:
self._count_child_documents(
0, r.content.decode(), 1, "highlights plain_text"
)
self.assertEqual(r.content.decode().count("<mark>Document</mark>"), 1)
self.assertEqual(
r.content.decode().count("<mark>attachment</mark>"), 1
r.content.decode().count("<mark>Document attachment</mark>"), 1
)

# Highlight filter: caseName
Expand Down Expand Up @@ -1620,6 +1622,7 @@ async def test_results_highlights(self) -> None:
params, 1, "highlights Nature of Suit"
)
self.assertIn("<mark>Thalassa</mark>", r.content.decode())
self.assertIn("<mark>Miller</mark>", r.content.decode())

# Highlight filter: Referred to
params = {"type": SEARCH_TYPES.RECAP, "referred_to": "Persephone"}
Expand All @@ -1635,11 +1638,9 @@ async def test_results_highlights(self) -> None:
r = await self._test_article_count(params, 1, "filter + query")
self.assertIn("<mark>Amicus</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>Amicus</mark>"), 1)
self.assertIn("<mark>Document</mark>", r.content.decode())
self.assertIn("<mark>attachment</mark>", r.content.decode())
self.assertEqual(r.content.decode().count("<mark>Document</mark>"), 1)
self.assertIn("<mark>Document attachment</mark>", r.content.decode())
self.assertEqual(
r.content.decode().count("<mark>attachment</mark>"), 1
r.content.decode().count("<mark>Document attachment</mark>"), 1
)

@override_settings(NO_MATCH_HL_SIZE=50)
Expand All @@ -1666,9 +1667,8 @@ def test_phrase_search_stemming(self) -> None:
)

# Confirm phrase search are properly highlighted.
terms_list = search_phrase.replace('"', "").split(" ")
for term in terms_list:
self.assertIn(f"<mark>{term}</mark>", r.content.decode())
search_term = search_phrase.replace('"', "")
self.assertIn(f"<mark>{search_term}</mark>", r.content.decode())

with self.captureOnCommitCallbacks(execute=True):
rd_1.delete()
Expand Down Expand Up @@ -1707,9 +1707,8 @@ def test_phrase_search_duplicated_terms(self) -> None:
)

# Confirm phrase search are properly highlighted.
terms_list = search_phrase.replace('"', "").split(" ")
for term in terms_list:
self.assertIn(f"<mark>{term}</mark>", r.content.decode())
search_term = search_phrase.replace('"', "")
self.assertIn(f"<mark>{search_term}</mark>", r.content.decode())

# Confirm we're able to HL terms combined with chars like ",", "." or
# or any other symbols.
Expand All @@ -1725,9 +1724,11 @@ def test_phrase_search_duplicated_terms(self) -> None:
)

# Confirm phrase search are properly highlighted.
terms_list = search_phrase.replace('"', "").split(" ")
for term in terms_list:
self.assertIn(f"<mark>{term}</mark>", r.content.decode())
self.assertIn(
f"<mark>this was finished, this unwieldy process</mark>",
r.content.decode(),
)
self.assertIn(f"<mark>ipsum</mark>", r.content.decode())

with self.captureOnCommitCallbacks(execute=True):
rd_1.delete()
Expand Down

0 comments on commit b087596

Please sign in to comment.