Skip to content

Commit

Permalink
Merge pull request #3718 from owid/search-country-matching-stop-words
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelgerber authored Jun 25, 2024
2 parents 537dcbc + 7da0aaa commit f603a6b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 34 deletions.
36 changes: 30 additions & 6 deletions site/search/SearchPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ import {
extractRegionNamesFromSearchQuery,
pickEntitiesForChartHit,
} from "./SearchUtils.js"
import { HitAttributeHighlightResult } from "instantsearch.js"

const siteAnalytics = new SiteAnalytics()

Expand Down Expand Up @@ -131,14 +132,30 @@ const getEntityQueryStr = (
}
}

function ChartHit({ hit }: { hit: IChartHit }) {
function ChartHit({
hit,
searchQueryRegionsMatches,
}: {
hit: IChartHit
searchQueryRegionsMatches?: Region[] | undefined
}) {
const [imgLoaded, setImgLoaded] = useState(false)
const [imgError, setImgError] = useState(false)

const entities = useMemo(
() => pickEntitiesForChartHit(hit),
// eslint-disable-next-line react-hooks/exhaustive-deps
[hit._highlightResult?.availableEntities]
() =>
pickEntitiesForChartHit(
hit._highlightResult?.availableEntities as
| HitAttributeHighlightResult[]
| undefined,
hit.availableEntities,
searchQueryRegionsMatches
),
[
hit._highlightResult?.availableEntities,
hit.availableEntities,
searchQueryRegionsMatches,
]
)
const queryStr = useMemo(() => getEntityQueryStr(entities), [entities])
const previewUrl = queryStr
Expand Down Expand Up @@ -702,13 +719,20 @@ const SearchResults = (props: SearchResultsProps) => {
/>
</div>
</header>
<Hits
<Hits<IChartHit>
classNames={{
root: "search-results__list-container",
list: "search-results__charts-list grid grid-cols-4 grid-sm-cols-2",
item: "search-results__chart-hit span-md-cols-2",
}}
hitComponent={ChartHit}
hitComponent={(props) => (
<ChartHit
{...props}
searchQueryRegionsMatches={
searchQueryRegionsMatches
}
/>
)}
/>
</section>
</NoResultsBoundary>
Expand Down
86 changes: 58 additions & 28 deletions site/search/SearchUtils.tsx
Original file line number Diff line number Diff line change
@@ -1,15 +1,35 @@
import { HitAttributeHighlightResult } from "instantsearch.js"
import { IChartHit } from "./searchTypes.js"
import { EntityName } from "@ourworldindata/types"
import {
Region,
getRegionByNameOrVariantName,
regions,
countries,
escapeRegExp,
removeTrailingParenthetical,
} from "@ourworldindata/utils"

/**
* The below code is used to search for entities we can highlight in charts and explorer results.
*
* There are two main functions here:
* - `extractRegionNamesFromSearchQuery` looks at the search query (e.g. "covid cases us china asia") and extracts anything
* that looks like a country, region or variant name (e.g. "US"), case-insensitive.
* It doesn't have any knowledge of what entities are actually available.
* - `pickEntitiesForChartHit` gets information about the entities available in a chart.
* It also receives the result of `extractRegionNamesFromSearchQuery`, i.e. a list of regions that are mentioned in the search query.
* This is useful because Algolia removes stop words like "the" and "and", which makes it difficult to match entities like
* "Trinidad and Tobago".
* - It then reduces this list to the entities that are actually available in the chart.
* - Afterwards, it uses the highlighted entities from Algolia to pick any other entities that are fully contained in the
* search query - this now adds any entities _not_ in the `regions` list, like "high-income countries" or "Salmon (farmed)".
*
* In practice, we use `extractRegionNamesFromSearchQuery` for explorers, since there we don't have any entity information available,
* and can only act based on the fact that most explorers are country-based and have data for most countries and regions.
* For charts, we use the more accurate `pickEntitiesForChartHit` function, since entity information is available.
*
* -- @marcelgerber, 2024-06-18
*/

const allCountryNamesAndVariants = regions.flatMap((c) => [
c.name,
...(("variantNames" in c && c.variantNames) || []),
Expand All @@ -31,18 +51,39 @@ export const extractRegionNamesFromSearchQuery = (query: string) => {
/**
 * Strips every opening and closing `<mark>` / `<strong>` tag from `text`,
 * leaving the text between the tags untouched. Used to turn a highlighted
 * entity value back into the plain entity name.
 */
const removeHighlightTags = (text: string) => {
    const highlightTagPattern = /<\/?(mark|strong)>/g
    return text.replace(highlightTagPattern, "")
}

export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {
const availableEntitiesHighlighted = hit._highlightResult
?.availableEntities as HitAttributeHighlightResult[] | undefined
export function pickEntitiesForChartHit(
availableEntitiesHighlighted: HitAttributeHighlightResult[] | undefined,
availableEntities: EntityName[] | undefined,
searchQueryRegionsMatches: Region[] | undefined
): EntityName[] {
if (!availableEntities) return []

const pickedEntities = availableEntitiesHighlighted
?.filter((highlightEntry) => {
if (highlightEntry.matchLevel === "none") return false
const pickedEntities = new Set(
searchQueryRegionsMatches?.map((r) => r.name)
)

// Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
const entityNameWithoutTrailingParens = removeTrailingParenthetical(
removeHighlightTags(highlightEntry.value)
// Build intersection of searchQueryRegionsMatches and availableEntities, so we only select entities that are actually present in the chart
if (pickedEntities.size > 0) {
const availableEntitiesSet = new Set(availableEntities)
for (const entity of pickedEntities) {
if (!availableEntitiesSet.has(entity)) {
pickedEntities.delete(entity)
}
}
}

if (availableEntitiesHighlighted) {
for (const highlightEntry of availableEntitiesHighlighted) {
if (highlightEntry.matchLevel === "none") continue

const withoutHighlightTags = removeHighlightTags(
highlightEntry.value
)
if (pickedEntities.has(withoutHighlightTags)) continue

// Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
const withoutTrailingParens =
removeTrailingParenthetical(withoutHighlightTags)

// The sequence of words that Algolia matched; could be something like ["arab", "united", "republic"]
// which we want to check against the entity name
Expand All @@ -53,27 +94,16 @@ export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {
// Pick entity if the matched sequence contains the full entity name
if (
matchedSequenceLowerCase.startsWith(
entityNameWithoutTrailingParens
withoutTrailingParens
.replaceAll("-", " ") // makes "high-income countries" into "high income countries", enabling a match
.toLowerCase()
)
)
return true

const country = countries.find(
(c) => c.name === entityNameWithoutTrailingParens
)
if (country?.variantNames) {
// Pick entity if the matched sequence contains any of the variant names
return country.variantNames.some((variant) =>
matchedSequenceLowerCase.includes(variant.toLowerCase())
)
}
pickedEntities.add(withoutHighlightTags)
}
}

return false
})
.map((highlightEntry) => removeHighlightTags(highlightEntry.value))
.sort()
const sortedEntities = [...pickedEntities].sort()

return pickedEntities ?? []
return sortedEntities ?? []
}

0 comments on commit f603a6b

Please sign in to comment.