Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhance(mysql): add better indexes for posts_gdocs table #4567

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions adminSiteServer/apiRoutes/misc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,18 @@ export async function fetchAllWork(
type GdocRecord = Pick<DbRawPostGdoc, "id" | "publishedAt">

const author = req.query.author
const gdocs = await db
.knexRaw<GdocRecord>(
trx,
`-- sql
SELECT id, publishedAt
const gdocs = await db.knexRaw<GdocRecord>(
trx,
`-- sql
SELECT id
FROM posts_gdocs
WHERE JSON_CONTAINS(content->'$.authors', ?)
WHERE JSON_CONTAINS(authors, ?)
AND type NOT IN ("data-insight", "fragment")
AND published = 1
ORDER BY publishedAt DESC
`,
[`"${author}"`]
)
.then((rows) => lodash.orderBy(rows, (row) => row.publishedAt, "desc"))
[`"${author}"`]
)

const archieLines = gdocs.map(
(post) => `url: https://docs.google.com/document/d/${post.id}/edit`
Expand Down
5 changes: 3 additions & 2 deletions db/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ export const getPublishedDataInsights = (
`-- sql
SELECT
content->>'$.title' AS title,
content->>'$.authors' AS authors,
authors,
publishedAt,
updatedAt,
slug,
Expand Down Expand Up @@ -426,7 +426,8 @@ export const getPublishedGdocPosts = async (
g.publishedAt,
g.revisionId,
g.slug,
g.updatedAt
g.updatedAt,
g.authors
FROM
posts_gdocs g
WHERE
Expand Down
55 changes: 55 additions & 0 deletions db/migration/1739788202649-PostsGdocsIndexes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { MigrationInterface, QueryRunner } from "typeorm"

export class PostsGdocsIndexes1739788202649 implements MigrationInterface {
public async up(queryRunner: QueryRunner): Promise<void> {
// Change `type` col def from `VIRTUAL` to `STORED`
await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
DROP INDEX idx_posts_gdocs_type,
DROP COLUMN type,
ADD COLUMN type VARCHAR(255) GENERATED ALWAYS AS (JSON_UNQUOTE(JSON_EXTRACT(content, '$.type'))) STORED AFTER slug
`)

// Add a combined index on `type`, `published` and `publishedAt`
// It can get used for just `type` queries, and often times we also filter by `published` in the same query
// and potentially by `publishedAt` as well
await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
ADD INDEX idx_posts_gdocs_type_published_publishedAt (type, published, publishedAt)
`)

// Another common type of query is to filter/sort by `published` and `publishedAt`
await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
ADD INDEX idx_posts_gdocs_published_publishedAt (published, publishedAt)
`)

// Add a computed stored column for `authors`
await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
ADD COLUMN authors JSON GENERATED ALWAYS AS (JSON_EXTRACT(content, '$.authors')) STORED AFTER content
`)
}

public async down(queryRunner: QueryRunner): Promise<void> {
await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
DROP COLUMN authors
`)

await queryRunner.query(`-- sql
DROP INDEX idx_posts_gdocs_published_publishedAt ON posts_gdocs
`)

await queryRunner.query(`-- sql
DROP INDEX idx_posts_gdocs_type_published_publishedAt ON posts_gdocs
`)

await queryRunner.query(`-- sql
ALTER TABLE posts_gdocs
DROP COLUMN type,
ADD COLUMN type VARCHAR(255) GENERATED ALWAYS AS (JSON_UNQUOTE(JSON_EXTRACT(content, '$.type'))) VIRTUAL AFTER slug,
ADD INDEX idx_posts_gdocs_type (type)
`)
}
}
2 changes: 1 addition & 1 deletion db/model/Gdoc/GdocBase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ export async function getMinimalGdocPostsByIds(
id,
content ->> '$.title' as title,
slug,
content ->> '$.authors' as authors,
authors,
publishedAt,
published,
content ->> '$.subtitle' as subtitle,
Expand Down
2 changes: 1 addition & 1 deletion db/model/Gdoc/GdocFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ export async function getAllMinimalGdocBaseObjects(
id,
content ->> '$.title' as title,
slug,
content ->> '$.authors' as authors,
authors,
publishedAt,
published,
content ->> '$.subtitle' as subtitle,
Expand Down
18 changes: 6 additions & 12 deletions db/model/Post.ts
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ export const getRelatedResearchAndWritingForVariables = async (
p.content ->> '$.title' AS title,
p.slug AS postSlug,
COALESCE(csr.chart_id, c.id) AS chartId,
p.content ->> '$.authors' AS authors,
authors,
p.content ->> '$."featured-image"' AS thumbnail,
COALESCE(pv.views_365d, 0) AS pageviews,
'gdocs' AS post_source,
Expand Down Expand Up @@ -657,7 +657,7 @@ export const getLatestWorkByAuthor = async (
pg.slug,
pg.content->>'$.title' AS title,
pg.content->>'$.subtitle' AS subtitle,
pg.content->>'$.authors' AS authors,
authors,
pg.publishedAt,
CASE
WHEN content ->> '$."deprecation-notice"' IS NOT NULL THEN '${ARCHVED_THUMBNAIL_FILENAME}'
Expand All @@ -666,19 +666,13 @@ export const getLatestWorkByAuthor = async (
FROM
posts_gdocs pg
WHERE
pg.content ->> '$.authors' LIKE ?
JSON_CONTAINS(authors, ?)
AND pg.published = TRUE
AND pg.type = "${OwidGdocType.Article}"
ORDER BY publishedAt DESC
`,
[`%${author}%`]
[`"${author}"`]
)

// We're sorting in JS because of the "Out of sort memory, consider
// increasing server sort buffer size" error when using ORDER BY. Adding an
// index on the publishedAt column doesn't help.
return sortBy(
rawLatestWorkLinks.map((work) => parseLatestWork(work)),
// Sort by most recent first
(work) => -work.publishedAt! // "!" because we're only selecting published posts, so publishedAt can't be NULL
)
return rawLatestWorkLinks.map(parseLatestWork)
}
34 changes: 26 additions & 8 deletions packages/@ourworldindata/types/src/dbTypes/PostsGdocs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { JsonString } from "../domainTypes/Various.js"
import {
OwidGdocContent,
OwidGdocPublicationContext,
OwidGdocType,
} from "../gdocTypes/Gdoc.js"
import { MinimalTag } from "./Tags.js"

Expand All @@ -20,14 +21,18 @@ export interface DbInsertPostGdoc {
slug: string
updatedAt?: Date | null
}
export type DbRawPostGdoc = Required<DbInsertPostGdoc>
export type DbRawPostGdoc = Required<DbInsertPostGdoc> & {
type?: OwidGdocType
authors?: JsonString
}
export type DbEnrichedPostGdoc = Omit<
DbRawPostGdoc,
"content" | "manualBreadcrumbs" | "published"
"content" | "manualBreadcrumbs" | "published" | "authors"
> & {
content: OwidGdocContent
manualBreadcrumbs: BreadcrumbItem[] | null
published: boolean
authors?: string[]
}

export type DBRawPostGdocWithTags = DbRawPostGdoc & {
Expand All @@ -52,6 +57,12 @@ export function parsePostsGdocsBreadcrumbs(
return breadcrumbs ? JSON.parse(breadcrumbs) : null
}

export function parsePostGdocsAuthors(
authors: JsonString | undefined
): string[] {
return authors ? JSON.parse(authors) : []
}

export function serializePostsGdocsBreadcrumbs(
breadcrumbs: BreadcrumbItem[] | null
): JsonString | null {
Expand All @@ -64,6 +75,7 @@ export function parsePostsGdocsRow(row: DbRawPostGdoc): DbEnrichedPostGdoc {
content: parsePostGdocContent(row.content),
manualBreadcrumbs: parsePostsGdocsBreadcrumbs(row.manualBreadcrumbs),
published: !!row.published,
authors: parsePostGdocsAuthors(row.authors),
}
}

Expand All @@ -76,12 +88,18 @@ export function parsePostsGdocsWithTagsRow(
}
}

export function serializePostsGdocsRow(row: DbEnrichedPostGdoc): DbRawPostGdoc {
// Kind of awkward, but automatic breadcrumbs are part of OwidGdocBaseInterface,
// but not part of the DB schema. So we remove them here.
if ("breadcrumbs" in row) {
delete row.breadcrumbs
}
export function serializePostsGdocsRow(
row: DbEnrichedPostGdoc
): DbInsertPostGdoc {
// Kind of awkward, but some props may be set on the row but we don't want to insert them.
// So we remove them here.
const KEYS_TO_REMOVE = ["breadcrumbs", "type", "authors"]

KEYS_TO_REMOVE.forEach((key) => {
if (key in row) {
delete (row as any)[key]
}
})
return {
...row,
content: serializePostGdocContent(row.content),
Expand Down
Loading