From 477421fc7495753597538249c2e8dfe1fd353bae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=AF=D0=BA=D0=B8=D0=BC=D0=B5=D0=BD=D0=BA=D0=BE=D0=B2=20?= =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=90=D0=BB=D0=B5=D0=BA?= =?UTF-8?q?=D1=81=D0=B0=D0=BD=D0=B4=D1=80=D0=BE=D0=B2=D0=B8=D1=87?= Date: Mon, 3 Feb 2025 18:04:47 +0300 Subject: [PATCH] [DOP-22585] Move views creation to migration --- .pre-commit-config.yaml | 4 +- ...02-03_f017d4c58658_create_analytic_view.py | 80 +++++++++++++++++++ ...tic_views.py => refresh_analytic_views.py} | 47 ----------- docs/reference/database/index.rst | 17 ++-- docs/reference/database/views_cli.rst | 4 +- pyproject.toml | 2 +- 6 files changed, 94 insertions(+), 60 deletions(-) create mode 100644 data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py rename data_rentgen/db/scripts/{create_analytic_views.py => refresh_analytic_views.py} (56%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7912bc8..f323317 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,10 +39,10 @@ repos: - id: chmod args: ['644'] exclude_types: [shell] - exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$ + exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/refresh_analytic_views\.py)$ - id: chmod args: ['755'] - files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$ + files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/refrsh_analytic_views\.py)$ - id: insert-license files: .*\.py$ exclude: ^(data_rentgen/dependencies/stub.py|docs/.*\.py|tests/.*\.py)$ diff --git a/data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py b/data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py new file mode 100644 index 0000000..ebb4d4d --- /dev/null +++ b/data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2024-2025 MTS PJSC +# SPDX-License-Identifier: Apache-2.0 +"""create_analytics_views + +Revision ID: f017d4c58658 +Revises: 56f5a3f9442a +Create Date: 2025-02-03 16:44:27.746785 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "f017d4c58658" +down_revision = "56f5a3f9442a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + for base_table in ("output", "input"): + for depth, suffix in view_sufix_map.items(): + view_name = base_table + suffix + op.execute(sa.text(f"DROP VIEW IF EXISTS {view_name}")) + op.execute(sa.text(get_statement(base_table, depth, suffix))) + + +def downgrade() -> None: + for base_table in ("output", "input"): + for _, suffix in view_sufix_map.items(): + view_name = base_table + suffix + op.execute(sa.text(f"DROP VIEW IF EXISTS {view_name}")) + + +view_sufix_map = { + "day": "_daily_stats", + "week": "_weekly_stats", + "month": "_monthly_stats", +} + + +def get_statement(base_table: str, depth: str, suffix: str) -> str: + view_name = base_table + suffix + return f""" + CREATE MATERIALIZED VIEW {view_name} + AS ( + WITH aggregates AS ( + SELECT + {base_table}.dataset_id as dataset_id + , u.id as user_id + , u.name as user_name + , max({base_table}.created_at) as last_interaction_dt + , count(*) as num_of_interactions + , sum(num_bytes) as sum_bytes + , sum(num_rows) as sum_rows + , sum(num_files) as sum_files + FROM {base_table} + JOIN run r ON {base_table}.run_id = r.id + JOIN public.user u ON r.started_by_user_id = u.id + WHERE {base_table}.created_at >= now() - interval '1 {depth}' + GROUP BY {base_table}.dataset_id, u.id + ) + SELECT + d.name as dataset_name + , l.name as dataset_location + , l.type as dataset_location_type + , agr.user_id + , agr.user_name + , agr.last_interaction_dt + , agr.num_of_interactions + , agr.sum_bytes + , agr.sum_rows + , agr.sum_files + FROM aggregates agr + JOIN dataset d ON agr.dataset_id = d.id + LEFT JOIN location l ON d.location_id = l.id + ) + WITH NO DATA + """ diff --git a/data_rentgen/db/scripts/create_analytic_views.py b/data_rentgen/db/scripts/refresh_analytic_views.py similarity index 56% rename from data_rentgen/db/scripts/create_analytic_views.py rename to data_rentgen/db/scripts/refresh_analytic_views.py index 935dbb0..ec7435e 100755 --- a/data_rentgen/db/scripts/create_analytic_views.py +++ b/data_rentgen/db/scripts/refresh_analytic_views.py @@ -52,52 +52,6 @@ def get_parser() -> ArgumentParser: return parser -def get_statement(base_table: str, type: str) -> str: - view_name = base_table + view_sufix_map[type] - return f""" - CREATE MATERIALIZED VIEW IF NOT EXISTS {view_name} - AS ( - WITH aggregates AS ( - SELECT - {base_table}.dataset_id as dataset_id - , u.id as user_id - , u.name as user_name - , max({base_table}.created_at) as last_interaction_dt - , count(*) as num_of_interactions - , sum(num_bytes) as sum_bytes - , sum(num_rows) as sum_rows - , sum(num_files) as sum_files - FROM {base_table} - JOIN run r ON {base_table}.run_id = r.id - JOIN public.user u ON r.started_by_user_id = u.id - WHERE {base_table}.created_at >= now() - interval '1 {type}' - GROUP BY {base_table}.dataset_id, u.id - ) - SELECT - d.name as dataset_name - , l.name as dataset_location - , l.type as dataset_location_type - , agr.user_id - , agr.user_name - , agr.last_interaction_dt - , agr.num_of_interactions - , agr.sum_bytes - , agr.sum_rows - , agr.sum_files - FROM aggregates agr - JOIN dataset d ON agr.dataset_id = d.id - LEFT JOIN location l ON d.location_id = l.id - ) - """ - - -async def create_view(depth: Depth, session: AsyncSession): - for base_table in ("output", "input"): - statement = get_statement(base_table, depth) - logger.debug("Executing statement: %s", statement) - await session.execute(text(statement)) - - async def refresh_view(depth: Depth, session: AsyncSession): for base_table in ("output", "input"): view_name = base_table + view_sufix_map[depth] @@ -122,7 +76,6 @@ async def main(args: list[str]) -> None: session_factory = create_session_factory(db_settings) async with session_factory() as session: for depth in depths: - await create_view(depth, session) await refresh_view(depth, session) await session.commit() diff --git a/docs/reference/database/index.rst b/docs/reference/database/index.rst index bab975a..14ac809 100644 --- a/docs/reference/database/index.rst +++ b/docs/reference/database/index.rst @@ -13,7 +13,8 @@ After migrations are performed, it is required to run script which creates parti By default, it creates monthly partitions, for current and next month. This can be changed by overriding command args. This script should run on schedule, for example by adding a dedicated entry to `crontab `_. -Additionally after migrations you can run a script which creates analytics views. +Along with migrations analytics views are created. By default these materializd views are empty(`WITH NO DATA`). +In order to fill these tables with data you need to run refresh script. The command for this shown below. Views based on data in ``output`` and ``input`` tables and has such structure: .. code:: text @@ -30,8 +31,8 @@ Views based on data in ``output`` and ``input`` tables and has such structure: sum_files - Sum of files in given interval. ``num_files`` - column. We provide three types of views: ``day``, ``week`` and ``month``, based on the time period in which the aggregation occur. -By default, script creates pair views for all intervals. -You can specify which views to create with ``depth`` parameter. Options are: ``day``, ``week``, ``month``. +Views are created for all intervals and by default, script refresh all views. +You can specify which views to refresh with ``depth`` parameter. Options are: ``day``, ``week``, ``month``. Requirements ------------ @@ -80,11 +81,11 @@ With Docker 0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_partitions" -* Create analytic views: +* Refresh analytic views: .. code:: console - $ docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views" + $ docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.refresh_analytic_views" * Add analytic views refresh script to crontab, to run every day: @@ -94,7 +95,7 @@ With Docker .. code:: text - 0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views" + 0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.refresh_analytic_views" Without Docker @@ -162,7 +163,7 @@ Without Docker .. code:: console - $ python -m data_rentgen.db.scripts.create_analytic_views + $ python -m data_rentgen.db.scripts.refresh_analytic_views * Add analytic views refresh script to crontab, to run every day: @@ -173,7 +174,7 @@ Without Docker .. code:: text # read settings from .env file, and run script using a specific venv with all required dependencies - 0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.create_analytic_views" + 0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.refresh_analytic_views" diff --git a/docs/reference/database/views_cli.rst b/docs/reference/database/views_cli.rst index fe481d1..e7af962 100644 --- a/docs/reference/database/views_cli.rst +++ b/docs/reference/database/views_cli.rst @@ -4,6 +4,6 @@ CLI for creating views =========================== .. argparse:: - :module: data_rentgen.db.scripts.create_analytic_views + :module: data_rentgen.db.scripts.refresh_analytic_views :func: get_parser - :prog: python -m data_rentgen.db.scripts.create_analytic_views + :prog: python -m data_rentgen.db.scripts.refresh_analytic_views diff --git a/pyproject.toml b/pyproject.toml index aaa0bb8..05a7abb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -445,7 +445,7 @@ per-file-ignores = [ # WPS432 Found magic number: 255 "data_rentgen/db/*.py:WPS102,WPS432", # S608 Possible SQL injection - "data_rentgen/db/scripts/create_analytic_views.py:S608", + "data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py:S608,WPS102", # WPS237 Found a too complex `f` string "data_rentgen/server/exceptions/*.py:WPS237", # WPS441 Found control variable used after block: input