Skip to content

Commit

Permalink
[DOP-22576] - Analytics views (#148)
Browse files Browse the repository at this point in the history
* [DOP-22576] Add script code

* [DOP-22576] Add docs for script

* [DOP-22576] script fixes

* [DOP-22576] fix depth description

* [DOP-22576] fix docker-compose ref in docs
  • Loading branch information
TiGrib authored Jan 29, 2025
1 parent ed8e3af commit 483fdcd
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 5 deletions.
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ repos:
- id: chmod
args: ['644']
exclude_types: [shell]
exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py)$
exclude: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$
- id: chmod
args: ['755']
files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py)$
files: ^(.*__main__\.py|data_rentgen/server/scripts/export_openapi_schema\.py|data_rentgen/db/scripts/create_partitions\.py|data_rentgen/db/scripts/create_analytic_views\.py)$
- id: insert-license
files: .*\.py$
exclude: ^(data_rentgen/dependencies/stub.py|docs/.*\.py|tests/.*\.py)$
Expand Down Expand Up @@ -149,7 +149,6 @@ repos:
- id: check-hooks-apply
- id: check-useless-excludes


ci:
skip:
- flake8 # checked with Github Actions
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ db-downgrade: ##@DB Downgrade head migration
db-partitions: ##@DB Create partitions
${POETRY} run python -m data_rentgen.db.scripts.create_partitions --start 2024-07-01

db-views: ##@DB Create views
${POETRY} run python -m data_rentgen.db.scripts.create_analytic_views $(ARGS)

broker: broker-start ##@Broker Prepare broker (in docker)

Expand Down
131 changes: 131 additions & 0 deletions data_rentgen/db/scripts/create_analytic_views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2024 MTS PJSC
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import asyncio
import logging
import sys
from argparse import ArgumentParser
from enum import Enum

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from data_rentgen.db.factory import create_session_factory
from data_rentgen.db.settings import DatabaseSettings
from data_rentgen.logging.settings import LoggingSettings
from data_rentgen.logging.setup_logging import setup_logging

logger = logging.getLogger(__name__)


class Depth(str, Enum):
    """Time window covered by an analytic view.

    Members are also ``str`` instances, so they compare equal to their plain
    string values (``Depth.DAY == "day"``).
    """

    DAY = "day"
    WEEK = "week"
    MONTH = "month"

    def __str__(self) -> str:
        # Render as the bare value ("day"), not the default "Depth.DAY",
        # so a member can be interpolated into text directly.
        rendered: str = self.value
        return rendered


# Suffix appended to a base table name ("input" / "output") to form the name
# of the materialized view for each depth value.
view_sufix_map: dict[str, str] = {
    "day": "_daily_stats",
    "week": "_weekly_stats",
    "month": "_monthly_stats",
}


def get_parser() -> ArgumentParser:
    """Build the command-line parser for the analytic views script.

    The parser accepts one optional argument, ``--depths``, which takes one or
    more values. Each value is converted to a ``Depth`` member. When the
    argument is omitted, ``depths`` is ``None``.

    Returns:
        The configured ``ArgumentParser``.
    """
    parser = ArgumentParser(
        # Must name the real module (create_analytic_views); otherwise the
        # command suggested to the user cannot be run.
        usage="python3 -m data_rentgen.db.scripts.create_analytic_views --depths day",
        description="Create materialized views based on input and output table with given depths",
    )
    parser.add_argument(
        "--depths",
        type=Depth,
        # Plain string values are listed so help and error output show
        # "day, week, month"; converted Depth members still match them
        # because Depth is a str subclass.
        choices=[item.value for item in Depth],
        nargs="+",
        help="Depth of materialized view data (created_at filter). You can provide list of args",
    )
    return parser


def get_statement(base_table: str, type: str) -> str:
    """Render the ``CREATE MATERIALIZED VIEW IF NOT EXISTS`` statement for a view.

    The view is named ``<base_table><suffix>``, where the suffix is looked up
    in ``view_sufix_map`` by ``type``. It aggregates rows of ``base_table``
    created within the last ``1 <type>`` interval, grouped by dataset and by
    the user who started the run, and joins dataset and location details.

    Both arguments are interpolated into the SQL text as-is, so they must be
    trusted identifiers, never user-supplied free text.

    Args:
        base_table: Name of the table to aggregate.
        type: Interval unit; must be a key of ``view_sufix_map``.

    Returns:
        SQL text of the statement.

    Raises:
        KeyError: If ``type`` is not a key of ``view_sufix_map``.
    """
    suffix = view_sufix_map[type]
    view_name = f"{base_table}{suffix}"
    return f"""
CREATE MATERIALIZED VIEW IF NOT EXISTS {view_name}
AS (
WITH aggregates AS (
SELECT
{base_table}.dataset_id as dataset_id
, u.id as user_id
, u.name as user_name
, max({base_table}.created_at) as last_interaction_dt
, count(*) as num_of_interactions
, sum(num_bytes) as sum_bytes
, sum(num_rows) as sum_rows
, sum(num_files) as sum_files
FROM {base_table}
JOIN run r ON {base_table}.run_id = r.id
JOIN public.user u ON r.started_by_user_id = u.id
WHERE {base_table}.created_at >= now() - interval '1 {type}'
GROUP BY {base_table}.dataset_id, u.id
)
SELECT
d.name as dataset_name
, l.name as dataset_location
, l.type as dataset_location_type
, agr.user_id
, agr.user_name
, agr.last_interaction_dt
, agr.num_of_interactions
, agr.sum_bytes
, agr.sum_rows
, agr.sum_files
FROM aggregates agr
JOIN dataset d ON agr.dataset_id = d.id
LEFT JOIN location l ON d.location_id = l.id
)
"""


async def create_view(depth: Depth, session: AsyncSession):
    """Create the materialized views of the given depth for both base tables.

    Executes one ``CREATE MATERIALIZED VIEW IF NOT EXISTS`` statement for
    ``output`` and one for ``input``. Nothing is committed here.

    Args:
        depth: Time window of the views to create.
        session: Session used to execute the statements.
    """
    base_tables = ("output", "input")
    for table_name in base_tables:
        ddl = get_statement(table_name, depth)
        logger.debug("Executing statement: %s", ddl)
        await session.execute(text(ddl))


async def refresh_view(depth: Depth, session: AsyncSession):
    """Refresh the materialized views of the given depth for both base tables.

    Executes ``REFRESH MATERIALIZED VIEW`` for the ``output`` view and then
    the ``input`` view. Nothing is committed here.

    Args:
        depth: Time window of the views to refresh.
        session: Session used to execute the statements.
    """
    # The suffix depends only on the depth, so resolve it once up front.
    suffix = view_sufix_map[depth]
    for table_name in ("output", "input"):
        view_name = f"{table_name}{suffix}"
        logger.info("Refresh view: %s", view_name)
        await session.execute(text(f"REFRESH MATERIALIZED VIEW {view_name}"))


async def main(args: list[str]) -> None:
    """Create and refresh analytic views for the requested depths.

    Parses ``args``, then for every selected depth creates the views (if they
    do not exist yet) and refreshes them, all within a single session that is
    committed at the end.

    Args:
        args: Command-line arguments without the program name.
    """
    setup_logging(LoggingSettings())

    params = get_parser().parse_args(args)
    if params.depths:
        # Drop duplicates and process depths in a stable order.
        depths = sorted(set(params.depths))
    else:
        logger.info("Create views for all depths")
        depths = Depth

    session_factory = create_session_factory(DatabaseSettings())
    async with session_factory() as session:
        for depth in depths:
            await create_view(depth, session)
            await refresh_view(depth, session)
        await session.commit()


if __name__ == "__main__":
    # Pass only the user-supplied arguments; sys.argv[0] is the program name.
    cli_args = sys.argv[1:]
    asyncio.run(main(cli_args))
9 changes: 9 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ services:
db:
condition: service_healthy

db-views:
image: mtsrus/data-rentgen:develop
command: |
python -m data_rentgen.db.scripts.create_analytic_views
env_file: .env.docker
depends_on:
db-migration:
condition: service_completed_successfully

frontend:
image: mtsrus/data-rentgen-ui:develop
restart: unless-stopped
Expand Down
59 changes: 58 additions & 1 deletion docs/reference/database/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,26 @@ After migrations are performed, it is required to run script which creates parti
By default, it creates monthly partitions, for current and next month. This can be changed by overriding command args.
This script should run on schedule, for example by adding a dedicated entry to `crontab <https://help.ubuntu.com/community/CronHowto>`_.

Additionally after migrations you can run a script which creates analytics views.
Views are based on data in the ``output`` and ``input`` tables and have the following structure:

.. code:: text
dataset_name - Name of dataset.
dataset_location - Name of dataset location (e.g. cluster name).
dataset_location_type - Type of dataset location (e.g. hive, hdfs, postgres).
user_id - Internal user id.
user_name - Internal user name (e.g. name of the user who ran the Spark job).
last_interaction_dt - Time when the user last interacted with the dataset. Whether it was a read or a write depends on the base table.
num_of_interactions - Number of interactions in given interval.
sum_bytes - Sum of bytes in given interval. ``num_bytes`` - column.
sum_rows - Sum of rows in given interval. ``num_rows`` - column.
sum_files - Sum of files in given interval. ``num_files`` - column.
We provide three types of views: ``day``, ``week`` and ``month``, based on the time period over which the aggregation occurs.
By default, the script creates a pair of views (one for ``input`` and one for ``output``) for every interval.
You can specify which views to create with the ``--depths`` parameter. Options are: ``day``, ``week``, ``month``.

Requirements
------------

Expand Down Expand Up @@ -43,7 +63,7 @@ With Docker
.. dropdown:: ``docker-compose.yml``

.. literalinclude:: ../../../docker-compose.yml
:emphasize-lines: 1-15,93-94
:emphasize-lines: 1-15,102-103

.. dropdown:: ``.env.docker``

Expand All @@ -60,6 +80,23 @@ With Docker
0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_partitions"
* Create analytic views:

.. code:: console
$ docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views"
* Add analytic views refresh script to crontab, to run every day:

.. code:: console
$ crontab -e
.. code:: text
0 0 * * * docker exec data-rentgen-server-1 "python -m data_rentgen.db.scripts.create_analytic_views"
Without Docker
~~~~~~~~~~~~~~

Expand Down Expand Up @@ -121,6 +158,25 @@ Without Docker
# read settings from .env file, and run script using a specific venv with all required dependencies
0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.create_partitions"
* Create analytic views:

.. code:: console
$ python -m data_rentgen.db.scripts.create_analytic_views
* Add analytic views refresh script to crontab, to run every day:

.. code:: console
$ crontab -e
.. code:: text
# read settings from .env file, and run script using a specific venv with all required dependencies
0 0 * * * /bin/bash -c "source /some/.env && /some/.venv/bin/python -m data_rentgen.db.scripts.create_analytic_views"
See also
--------

Expand All @@ -129,4 +185,5 @@ See also

configuration
partitions_cli
views_cli
structure
9 changes: 9 additions & 0 deletions docs/reference/database/views_cli.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.. _create-views-cli:

CLI for creating views
===========================

.. argparse::
:module: data_rentgen.db.scripts.create_analytic_views
:func: get_parser
:prog: python -m data_rentgen.db.scripts.create_analytic_views
2 changes: 1 addition & 1 deletion docs/reference/frontend/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ With Docker
.. dropdown:: ``docker-compose.yml``

.. literalinclude:: ../../../docker-compose.yml
:emphasize-lines: 77-91
:emphasize-lines: 86-100

.. dropdown:: ``.env.docker``

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,8 @@ per-file-ignores = [
# WPS102 Found incorrect module name pattern
# WPS432 Found magic number: 255
"data_rentgen/db/*.py:WPS102,WPS432",
# S608 Possible SQL injection
"data_rentgen/db/scripts/create_analytic_views.py:S608",
# WPS237 Found a too complex `f` string
"data_rentgen/server/exceptions/*.py:WPS237",
# WPS441 Found control variable used after block: input
Expand Down

0 comments on commit 483fdcd

Please sign in to comment.