Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOP-22585] Move views creation to migration #149

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ repos:
- id: chmod
args: ['644']
exclude_types: [shell]
exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$
exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/refresh_analytic_views\.py)$
- id: chmod
args: ['755']
files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$
files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/refresh_analytic_views\.py)$
TiGrib marked this conversation as resolved.
Show resolved Hide resolved
- id: insert-license
files: .*\.py$
exclude: ^(data_rentgen/dependencies/stub.py|docs/.*\.py|tests/.*\.py)$
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# SPDX-FileCopyrightText: 2024-2025 MTS PJSC
# SPDX-License-Identifier: Apache-2.0
"""create_analytics_views

Revision ID: f017d4c58658
Revises: 56f5a3f9442a
Create Date: 2025-02-03 16:44:27.746785

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "f017d4c58658"  # id of this migration
down_revision = "56f5a3f9442a"  # parent revision this one applies on top of
branch_labels = None  # not the head of a named branch
depends_on = None  # no cross-branch dependencies


def upgrade() -> None:
    """Create the analytic materialized views for the ``output``/``input`` tables.

    Any pre-existing view with the same name (previously created by the legacy
    ``create_analytic_views`` script) is dropped first, then recreated
    ``WITH NO DATA``; the refresh script must be run to populate it.
    """
    for base_table in ("output", "input"):
        for depth, suffix in view_sufix_map.items():
            view_name = base_table + suffix
            # BUGFIX: the legacy script created these as MATERIALIZED views;
            # in PostgreSQL, plain ``DROP VIEW`` raises an error on a
            # materialized view (even with IF EXISTS), so the drop must use
            # DROP MATERIALIZED VIEW.
            op.execute(sa.text(f"DROP MATERIALIZED VIEW IF EXISTS {view_name}"))
            op.execute(sa.text(get_statement(base_table, depth, suffix)))


def downgrade() -> None:
    """Drop the analytic materialized views created by :func:`upgrade`."""
    for base_table in ("output", "input"):
        # Only the suffixes are needed here, so iterate ``values()`` instead
        # of discarding the key from ``items()``.
        for suffix in view_sufix_map.values():
            view_name = base_table + suffix
            # BUGFIX: upgrade() creates MATERIALIZED views; plain ``DROP VIEW``
            # fails on a materialized view in PostgreSQL, so use
            # DROP MATERIALIZED VIEW here.
            op.execute(sa.text(f"DROP MATERIALIZED VIEW IF EXISTS {view_name}"))


# Maps an aggregation depth (a PostgreSQL interval unit) to the suffix that is
# appended to the base table name ("output"/"input") to form the view name.
# NOTE: the "sufix" spelling is kept intentionally — external callers
# reference this name.
view_sufix_map = dict(
    day="_daily_stats",
    week="_weekly_stats",
    month="_monthly_stats",
)


def get_statement(base_table: str, depth: str, suffix: str) -> str:
    """Return the ``CREATE MATERIALIZED VIEW`` statement for one analytic view.

    Args:
        base_table: source table to aggregate, ``"output"`` or ``"input"``.
        depth: PostgreSQL interval unit (``day``/``week``/``month``) limiting
            how far back rows are aggregated.
        suffix: suffix appended to ``base_table`` to form the view name,
            e.g. ``"_daily_stats"``.

    The view aggregates per (dataset, user) interaction stats joined with
    dataset/location metadata, and is created ``WITH NO DATA`` — it must be
    refreshed before first use.

    NOTE: f-string interpolation into SQL (flagged as S608) is acceptable here
    because every argument comes from hard-coded values in this migration,
    never from user input.
    """
    view_name = base_table + suffix
    return f"""
    CREATE MATERIALIZED VIEW {view_name}
    AS (
        WITH aggregates AS (
            SELECT
                {base_table}.dataset_id as dataset_id
                , u.id as user_id
                , u.name as user_name
                , max({base_table}.created_at) as last_interaction_dt
                , count(*) as num_of_interactions
                , sum(num_bytes) as sum_bytes
                , sum(num_rows) as sum_rows
                , sum(num_files) as sum_files
            FROM {base_table}
            JOIN run r ON {base_table}.run_id = r.id
            JOIN public.user u ON r.started_by_user_id = u.id
            WHERE {base_table}.created_at >= now() - interval '1 {depth}'
            GROUP BY {base_table}.dataset_id, u.id
        )
        SELECT
            d.name as dataset_name
            , l.name as dataset_location
            , l.type as dataset_location_type
            , agr.user_id
            , agr.user_name
            , agr.last_interaction_dt
            , agr.num_of_interactions
            , agr.sum_bytes
            , agr.sum_rows
            , agr.sum_files
        FROM aggregates agr
        JOIN dataset d ON agr.dataset_id = d.id
        LEFT JOIN location l ON d.location_id = l.id
    )
    WITH NO DATA
    """
Original file line number Diff line number Diff line change
Expand Up @@ -52,52 +52,6 @@ def get_parser() -> ArgumentParser:
return parser


def get_statement(base_table: str, type: str) -> str:
    """Return the ``CREATE MATERIALIZED VIEW IF NOT EXISTS`` statement for one view.

    Args:
        base_table: source table to aggregate, ``"output"`` or ``"input"``.
        type: PostgreSQL interval unit (``day``/``week``/``month``); also the
            key into ``view_sufix_map`` that determines the view-name suffix.
            NOTE: this parameter shadows the builtin ``type``; it is kept as-is
            because callers may pass it by keyword.
    """
    view_name = base_table + view_sufix_map[type]
    return f"""
    CREATE MATERIALIZED VIEW IF NOT EXISTS {view_name}
    AS (
        WITH aggregates AS (
            SELECT
                {base_table}.dataset_id as dataset_id
                , u.id as user_id
                , u.name as user_name
                , max({base_table}.created_at) as last_interaction_dt
                , count(*) as num_of_interactions
                , sum(num_bytes) as sum_bytes
                , sum(num_rows) as sum_rows
                , sum(num_files) as sum_files
            FROM {base_table}
            JOIN run r ON {base_table}.run_id = r.id
            JOIN public.user u ON r.started_by_user_id = u.id
            WHERE {base_table}.created_at >= now() - interval '1 {type}'
            GROUP BY {base_table}.dataset_id, u.id
        )
        SELECT
            d.name as dataset_name
            , l.name as dataset_location
            , l.type as dataset_location_type
            , agr.user_id
            , agr.user_name
            , agr.last_interaction_dt
            , agr.num_of_interactions
            , agr.sum_bytes
            , agr.sum_rows
            , agr.sum_files
        FROM aggregates agr
        JOIN dataset d ON agr.dataset_id = d.id
        LEFT JOIN location l ON d.location_id = l.id
    )
    """


async def create_view(depth: Depth, session: AsyncSession):
    """Create the pair of analytic views (for ``output`` and ``input``) at *depth*.

    Statements are executed on *session* but not committed — the caller is
    responsible for committing the transaction.
    """
    for base_table in ("output", "input"):
        statement = get_statement(base_table, depth)
        # Lazy %-style logging args keep formatting cost off the hot path.
        logger.debug("Executing statement: %s", statement)
        await session.execute(text(statement))


async def refresh_view(depth: Depth, session: AsyncSession):
for base_table in ("output", "input"):
view_name = base_table + view_sufix_map[depth]
Expand All @@ -122,7 +76,6 @@ async def main(args: list[str]) -> None:
session_factory = create_session_factory(db_settings)
async with session_factory() as session:
for depth in depths:
await create_view(depth, session)
await refresh_view(depth, session)
await session.commit()

Expand Down
17 changes: 9 additions & 8 deletions docs/reference/database/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ After migrations are performed, it is required to run script which creates parti
By default, it creates monthly partitions, for current and next month. This can be changed by overriding command args.
This script should run on schedule, for example by adding a dedicated entry to `crontab <https://help.ubuntu.com/community/CronHowto>`_.

Additionally after migrations you can run a script which creates analytics views.
Along with migrations, analytics views are created. By default these materialized views are empty (``WITH NO DATA``).
TiGrib marked this conversation as resolved.
Show resolved Hide resolved
In order to fill these views with data you need to run the refresh script; the command for this is shown below.
Views are based on data in the ``output`` and ``input`` tables and have the following structure:

.. code:: text
Expand All @@ -30,8 +31,8 @@ Views based on data in ``output`` and ``input`` tables and has such structure:
sum_files - Sum of files in given interval. ``num_files`` - column.

We provide three types of views: ``day``, ``week`` and ``month``, based on the time period in which the aggregation occurs.
By default, script creates pair views for all intervals.
You can specify which views to create with ``depth`` parameter. Options are: ``day``, ``week``, ``month``.
Views are created for all intervals, and by default the script refreshes all of them.
You can specify which views to refresh with ``depth`` parameter. Options are: ``day``, ``week``, ``month``.

Requirements
------------
Expand Down Expand Up @@ -80,11 +81,11 @@ With Docker

0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_partitions"

* Create analytic views:
* Refresh analytic views:

.. code:: console

$ docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views"
$ docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.refresh_analytic_views"

* Add analytic views refresh script to crontab, to run every day:

Expand All @@ -94,7 +95,7 @@ With Docker

.. code:: text

0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views"
0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.refresh_analytic_views"


Without Docker
Expand Down Expand Up @@ -162,7 +163,7 @@ Without Docker

.. code:: console

$ python -m data_rentgen.db.scripts.create_analytic_views
$ python -m data_rentgen.db.scripts.refresh_analytic_views

* Add analytic views refresh script to crontab, to run every day:

Expand All @@ -173,7 +174,7 @@ Without Docker
.. code:: text

# read settings from .env file, and run script using a specific venv with all required dependencies
0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.create_analytic_views"
0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.refresh_analytic_views"



Expand Down
4 changes: 2 additions & 2 deletions docs/reference/database/views_cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ CLI for creating views
===========================

.. argparse::
:module: data_rentgen.db.scripts.create_analytic_views
:module: data_rentgen.db.scripts.refresh_analytic_views
:func: get_parser
:prog: python -m data_rentgen.db.scripts.create_analytic_views
:prog: python -m data_rentgen.db.scripts.refresh_analytic_views
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ per-file-ignores = [
# WPS432 Found magic number: 255
"data_rentgen/db/*.py:WPS102,WPS432",
# S608 Possible SQL injection
"data_rentgen/db/scripts/create_analytic_views.py:S608",
"data_rentgen/db/migrations/versions/2025-02-03_f017d4c58658_create_analytic_view.py:S608,WPS102",
# WPS237 Found a too complex `f` string
"data_rentgen/server/exceptions/*.py:WPS237",
# WPS441 Found control variable used after block: input
Expand Down
Loading